Mercurial > repos > proteore > proteore_data_manager
annotate data_manager/resource_building.py @ 53:bb552aa4b9ac draft
"planemo upload commit 59b014e9f6e2d668cbd7c4844b10db3d59baefd8-dirty"
| author | proteore |
|---|---|
| date | Fri, 05 Jun 2020 13:49:53 +0000 |
| parents | 55b12ec24a9f |
| children | 109fc5236204 |
| rev | line source |
|---|---|
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
1 # -*- coding: utf-8 -*- |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
2 """ |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
3 The purpose of this script is to create source files from different databases to be used in other proteore tools |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
4 """ |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
5 |
|
42
f6a6a70712c4
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
41
diff
changeset
|
6 import os, shutil, sys, argparse, requests, time, csv, re, json, shutil, zipfile, subprocess |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
7 from io import BytesIO |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
8 from zipfile import ZipFile |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
9 from galaxy.util.json import from_json_string, to_json_string |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
10 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
11 ####################################################################################################### |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
12 # General functions |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
13 ####################################################################################################### |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
def unzip(url, output_file):
    """
    Download a zip archive from `url` and write its first member to `output_file`.

    Only the first entry listed in the archive is extracted; any other
    member is ignored. The member is written as raw bytes, so the output
    file is a byte-for-byte copy of the archived file.

    Raises requests.HTTPError when the server answers with an error status.
    """
    response = requests.get(url)
    # Fail here rather than handing an error page to ZipFile
    response.raise_for_status()
    with ZipFile(BytesIO(response.content)) as archive:
        # ZipFile.read() returns bytes, hence the binary write below
        member_content = archive.read(archive.namelist()[0])
    with open(output_file, "wb") as output:
        output.write(member_content)
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
25 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
def _add_data_table_entry(data_manager_dict, data_table_entry, data_table):
    """
    Append `data_table_entry` to the list stored under
    data_manager_dict['data_tables'][data_table].

    The 'data_tables' mapping and the per-table list are created when
    they do not exist yet. The dictionary is modified in place and is
    also returned.
    """
    tables = data_manager_dict.setdefault('data_tables', {})
    tables.setdefault(data_table, []).append(data_table_entry)
    return data_manager_dict
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
31 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
32 ####################################################################################################### |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
33 # 1. Human Protein Atlas |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
34 # - Normal tissue |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
35 # - Pathology |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
36 # - Full Atlas |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
37 ####################################################################################################### |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
def HPA_sources(data_manager_dict, tissue, target_directory):
    """
    Download one Human Protein Atlas table and register it in the data manager.

    `tissue` selects the resource and must be one of "HPA_normal_tissue",
    "HPA_pathology", "HPA_full_atlas" or "HPA_RNA_tissue". The zipped TSV
    is fetched with unzip() into `target_directory` as
    <tissue>_<dd-mm-YYYY>.tsv, then an entry with the keys id, release,
    name, tissue and value is appended to the matching data table of
    `data_manager_dict`.

    Raises ValueError when `tissue` is not a supported resource.
    """
    # resource key -> (display name, download url, data table name)
    resources = {
        "HPA_normal_tissue": (
            "HPA normal tissue",
            "https://www.proteinatlas.org/download/normal_tissue.tsv.zip",
            "proteore_protein_atlas_normal_tissue",
        ),
        "HPA_pathology": (
            "HPA pathology",
            "https://www.proteinatlas.org/download/pathology.tsv.zip",
            "proteore_protein_atlas_tumor_tissue",
        ),
        "HPA_full_atlas": (
            "HPA full atlas",
            "https://www.proteinatlas.org/download/proteinatlas.tsv.zip",
            "proteore_protein_full_atlas",
        ),
        "HPA_RNA_tissue": (
            "HPA RNA tissue",
            "https://www.proteinatlas.org/download/rna_tissue_consensus.tsv.zip",
            "proteore_protein_atlas_rna_tissue",
        ),
    }
    if tissue not in resources:
        # Previously an unknown key surfaced later as an UnboundLocalError
        raise ValueError("Unknown HPA resource %r, expected one of: %s"
                         % (tissue, ", ".join(sorted(resources))))
    tissue_name, url, table = resources[tissue]

    # Read the clock once so file name, display name and id always agree,
    # even when the download straddles midnight
    now = time.localtime()

    output_file = tissue + "_" + time.strftime("%d-%m-%Y", now) + ".tsv"
    path = os.path.join(target_directory, output_file)
    unzip(url, path)  # download and save file

    tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y", now)
    release = tissue_name.replace(" ", "_").replace("/", "-")
    # Later dates give numerically smaller ids
    # (presumably so the newest release sorts first - TODO confirm)
    entry_id = str(10000000000 - int(time.strftime("%Y%m%d", now)))

    data_table_entry = dict(id=entry_id, release=release, name=tissue_name,
                            tissue=tissue, value=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, table)
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
66 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
67 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
68 ####################################################################################################### |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
69 # 2. Peptide Atlas |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
70 ####################################################################################################### |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
def peptide_atlas_sources(data_manager_dict, tissue, date, target_directory):
    """
    Query PeptideAtlas for one build and register the resulting table.

    `tissue` has the form "<id>.<tissue_name>": the part before the first
    dot is sent as atlas_build_id, the second part names the output.
    The TSV answer is reduced by build_dictionary() to one observation
    count per UniProt accession and written to
    <target_directory>/<tissue_name>_<date>.tsv with the header
    Uniprot_AC / nb_obs. An entry (id, name, value, tissue) is then
    appended to the "proteore_peptide_atlas" data table.

    Raises requests.HTTPError when the server answers with an error status.
    """
    # Define organism_id (here Human) - to be upgraded when other organisms are added to the project
    organism_id = "2"
    # Extract sample_category_id and output filename
    parts = tissue.split(".")
    sample_category_id = parts[0]
    tissue_name = parts[1]
    output_file = tissue_name + "_" + date + ".tsv"

    # Adjacent literals instead of a backslash continuation inside the
    # string, so source indentation can never end up inside the URL
    query = ("https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetProteins?&atlas_build_id="
             + sample_category_id
             + "&display_options=ShowAbundances&organism_id=" + organism_id
             + "&redundancy_constraint=4&presence_level_constraint=1%2C2&gene_annotation_level_constraint=leaf"
               "&QUERY_NAME=AT_GetProteins&action=QUERY&output_mode=tsv&apply_action=QUERY")

    with requests.Session() as s:
        download = s.get(query)
        # Do not parse an error page as data and register an empty table
        download.raise_for_status()
        decoded_content = download.content.decode('utf-8')
        cr = csv.reader(decoded_content.splitlines(), delimiter='\t')

        uni_dict = build_dictionary(cr)

    # columns of data table peptide_atlas
    tissue_id = tissue_name + "_" + date
    name = tissue_id.replace("-", "/").replace("_", " ")
    path = os.path.join(target_directory, output_file)

    with open(path, "w") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerow(["Uniprot_AC", "nb_obs"])
        w.writerows(uni_dict.items())

    data_table_entry = dict(id=tissue_id, name=name, value=path, tissue=tissue_name)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_peptide_atlas")
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
104 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
105 #function to count the number of observations by uniprot id |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
def build_dictionary (csv) :
    """
    Sum the observation counts per UniProt accession.

    `csv` is an iterable of rows (lists of strings), e.g. a csv.reader.
    Column 0 is read as the identifier and column 5 as an integer count.
    Rows whose identifier contains "-" or is rejected by
    check_uniprot_access() are ignored, as are rows with fewer than six
    columns.

    Returns a dict mapping accession -> total count.
    Raises ValueError if column 5 of a kept row is not an integer.
    """
    uni_dict = {}
    for line in csv :
        # csv.reader yields [] for blank lines; such rows (and truncated
        # ones) have no count column and used to raise IndexError
        if len(line) < 6 :
            continue
        accession = line[0]
        if "-" in accession or not check_uniprot_access(accession) :
            continue
        uni_dict[accession] = uni_dict.get(accession, 0) + int(line[5])

    return uni_dict
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
116 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
117 #function to check if an id is a UniProt accession number: returns True or False |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
def check_uniprot_access (id) :
    """
    Tell whether `id` begins with a UniProt accession number.

    The pattern is applied with re.match, i.e. anchored at the start of
    the string only; trailing characters are not checked.

    Returns True on a match, False otherwise.
    """
    accession_regex = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
    return re.match(accession_regex, id) is not None
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
124 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
def check_entrez_geneid (id) :
    """
    Tell whether `id` begins like an Entrez gene identifier.

    Accepted prefixes are a run of digits, or one or two capital letters
    followed by "_" and then digits, optionally preceded by one to four
    capital letters. The pattern is applied with re.match, so only the
    start of the string is checked.

    Returns True on a match, False otherwise.
    """
    geneid_regex = "[0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+"
    return re.match(geneid_regex, id) is not None
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
131 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
132 ####################################################################################################### |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
133 # 3. ID mapping file |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
134 ####################################################################################################### |
|
51
55b12ec24a9f
"planemo upload commit 59b014e9f6e2d668cbd7c4844b10db3d59baefd8-dirty"
proteore
parents:
50
diff
changeset
|
135 import ftplib, gzip |
|
55b12ec24a9f
"planemo upload commit 59b014e9f6e2d668cbd7c4844b10db3d59baefd8-dirty"
proteore
parents:
50
diff
changeset
|
136 from io import StringIO |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
# Raise the csv field size limit to handle big files.
# sys.maxsize does not fit in a C long on every platform (csv then raises
# OverflowError), so shrink the value until it is accepted.
_field_size_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_size_limit)
        break
    except OverflowError:
        _field_size_limit = _field_size_limit // 10
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
138 |
|
31
faeeabb11a4d
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
30
diff
changeset
|
139 def id_mapping_sources (data_manager_dict, species, target_directory, tool_data_path) : |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
140 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
141 human = species == "Human" |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
142 species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" } |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
143 files=["idmapping_selected.tab.gz","idmapping.dat.gz"] |
|
36
b18cb0d11971
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
35
diff
changeset
|
144 archive = os.path.join(tool_data_path, "id_mapping/ID_mapping_archive_"+species+"_"+str(time.strftime("%Y%m%d"))) |
|
38
6daab0a711e2
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
37
diff
changeset
|
145 if os.path.isdir(archive) is False : os.mkdir(archive) |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
146 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
147 #header |
|
30
a6cabd3ab71f
"planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents:
29
diff
changeset
|
148 if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG",'Gene_Name']] |
|
a6cabd3ab71f
"planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents:
29
diff
changeset
|
149 else : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG",'Gene_Name']] |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
150 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
151 #get selected.tab and keep only ids of interest |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
152 selected_tab_file=species_dict[species]+"_"+files[0] |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
153 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
154 with gzip.open(tab_path,"rt") as select : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
155 tab_reader = csv.reader(select,delimiter="\t") |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
156 for line in tab_reader : |
|
30
a6cabd3ab71f
"planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents:
29
diff
changeset
|
157 tab.append([line[0]]+[line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]]) |
|
41
fe21769281fa
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
40
diff
changeset
|
158 if os.path.exists(os.path.join(archive,tab_path.split("/")[-1])) : os.remove(os.path.join(archive,tab_path.split("/")[-1])) |
|
31
faeeabb11a4d
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
30
diff
changeset
|
159 shutil.move(tab_path, archive) |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
160 #print("selected_tab ok") |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
161 |
|
20
29cf75c83618
planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents:
16
diff
changeset
|
162 #get uniprot-AC reviewed |
|
22
4296fb613d9d
planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents:
21
diff
changeset
|
163 organism = species_dict[species].split("_")[1] |
|
4296fb613d9d
planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents:
21
diff
changeset
|
164 query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+organism+"&format=list" |
|
20
29cf75c83618
planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents:
16
diff
changeset
|
165 |
|
29cf75c83618
planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents:
16
diff
changeset
|
166 with requests.Session() as s: |
|
29cf75c83618
planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents:
16
diff
changeset
|
167 download = s.get(query) |
|
29cf75c83618
planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents:
16
diff
changeset
|
168 decoded_content = download.content.decode('utf-8') |
|
29cf75c83618
planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents:
16
diff
changeset
|
169 uniprot_reviewed_list = decoded_content.splitlines() |
|
29cf75c83618
planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents:
16
diff
changeset
|
170 |
|
31
faeeabb11a4d
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
30
diff
changeset
|
171 #save reviewed list |
|
faeeabb11a4d
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
30
diff
changeset
|
172 reviewed_list_path = os.path.join(archive,'uniprot_reviewed_list.txt') |
|
34
06b3181206b4
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
33
diff
changeset
|
173 with open(reviewed_list_path,'w') as reviewed_list_file: |
|
31
faeeabb11a4d
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
30
diff
changeset
|
174 for id in uniprot_reviewed_list: |
|
faeeabb11a4d
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
30
diff
changeset
|
175 reviewed_list_file.write(id+"\n") |
|
faeeabb11a4d
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
30
diff
changeset
|
176 |
|
faeeabb11a4d
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
30
diff
changeset
|
177 #remove unreviewed uniprot-AC |
|
20
29cf75c83618
planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents:
16
diff
changeset
|
178 for line in tab[1:]: |
|
30
a6cabd3ab71f
"planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents:
29
diff
changeset
|
179 UniProtAC = line[1] |
|
28
808c8493ed4f
planemo upload commit 5c2c274361c0daceae1f678eca0e6c0d5b4ba4f7-dirty
proteore
parents:
27
diff
changeset
|
180 if UniProtAC not in uniprot_reviewed_list : |
|
29
9a40b72414de
planemo upload commit 3c02acc07df862410ce979ce63e3d14f10edc50f-dirty
proteore
parents:
28
diff
changeset
|
181 line[1]="" |
|
20
29cf75c83618
planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents:
16
diff
changeset
|
182 |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
183 """ |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
184 Supplementary ID to get from HUMAN_9606_idmapping.dat : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
185 -NextProt,BioGrid,STRING,KEGG |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
186 """ |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
187 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
188 #there's more id type for human |
|
30
a6cabd3ab71f
"planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents:
29
diff
changeset
|
189 if human : ids = ['neXtProt','BioGrid','STRING','KEGG','Gene_Name' ] #ids to get from dat_file |
|
a6cabd3ab71f
"planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents:
29
diff
changeset
|
190 else : ids = ['BioGrid','STRING','KEGG','Gene_Name' ] |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
191 unidict = {} |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
192 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
193 #keep only ids of interest in dictionaries |
|
40
a7bcc9a0a456
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
39
diff
changeset
|
194 dat_file = species_dict[species]+"_"+files[1] |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
195 dat_path = download_from_uniprot_ftp(dat_file,target_directory) |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
196 with gzip.open(dat_path,"rt") as dat : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
197 dat_reader = csv.reader(dat,delimiter="\t") |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
198 for line in dat_reader : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
199 uniprotID=line[0] #UniProtID as key |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
200 id_type=line[1] #ID type of corresponding id, key of sub-dictionnary |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
201 cor_id=line[2] #corresponding id |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
202 if "-" not in id_type : #we don't keep isoform |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
203 if id_type in ids and uniprotID in unidict : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
204 if id_type in unidict[uniprotID] : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
205 unidict[uniprotID][id_type]= ";".join([unidict[uniprotID][id_type],cor_id]) #if there is already a value in the dictionnary |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
206 else : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
207 unidict[uniprotID].update({ id_type : cor_id }) |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
208 elif id_type in ids : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
209 unidict[uniprotID]={id_type : cor_id} |
|
41
fe21769281fa
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
40
diff
changeset
|
210 if os.path.exists(os.path.join(archive,dat_path.split("/")[-1])) : os.remove(os.path.join(archive,dat_path.split("/")[-1])) |
|
31
faeeabb11a4d
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
30
diff
changeset
|
211 shutil.move(dat_path, archive) |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
212 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
213 #print("dat_file ok") |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
214 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
215 #add ids from idmapping.dat to the final tab |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
216 for line in tab[1:] : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
217 uniprotID=line[0] |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
218 if human : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
219 if uniprotID in unidict : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
220 nextprot = access_dictionary(unidict,uniprotID,'neXtProt') |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
221 if nextprot != '' : nextprot = clean_nextprot_id(nextprot,line[0]) |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
222 line.extend([nextprot,access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'), |
|
30
a6cabd3ab71f
"planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents:
29
diff
changeset
|
223 access_dictionary(unidict,uniprotID,'KEGG'),access_dictionary(unidict,uniprotID,'Gene_Name')]) |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
224 else : |
|
30
a6cabd3ab71f
"planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents:
29
diff
changeset
|
225 line.extend(["","","","",""]) |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
226 else : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
227 if uniprotID in unidict : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
228 line.extend([access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'), |
|
30
a6cabd3ab71f
"planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents:
29
diff
changeset
|
229 access_dictionary(unidict,uniprotID,'KEGG'),access_dictionary(unidict,uniprotID,'Gene_Name')]) |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
230 else : |
|
30
a6cabd3ab71f
"planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents:
29
diff
changeset
|
231 line.extend(["","","",""]) |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
232 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
233 #print ("tab ok") |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
234 |
|
20
29cf75c83618
planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents:
16
diff
changeset
|
235 #add missing nextprot ID for human or replace old ones |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
236 if human : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
237 #build next_dict |
|
31
faeeabb11a4d
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
30
diff
changeset
|
238 nextprot_path = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory) |
|
faeeabb11a4d
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
30
diff
changeset
|
239 with open(nextprot_path,'r') as nextprot_ids : |
|
faeeabb11a4d
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
30
diff
changeset
|
240 nextprot_ids = nextprot_ids.read().splitlines() |
|
41
fe21769281fa
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
40
diff
changeset
|
241 if os.path.exists(os.path.join(archive,nextprot_path.split("/")[-1])) : os.remove(os.path.join(archive,nextprot_path.split("/")[-1])) |
|
31
faeeabb11a4d
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
30
diff
changeset
|
242 shutil.move(nextprot_path,archive) |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
243 next_dict = {} |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
244 for nextid in nextprot_ids : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
245 next_dict[nextid.replace("NX_","")] = nextid |
|
45
488e015576bc
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
44
diff
changeset
|
246 #os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt")) |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
247 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
248 #add missing nextprot ID |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
249 for line in tab[1:] : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
250 uniprotID=line[0] |
|
30
a6cabd3ab71f
"planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents:
29
diff
changeset
|
251 nextprotID=line[14] |
|
20
29cf75c83618
planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents:
16
diff
changeset
|
252 if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) : |
|
30
a6cabd3ab71f
"planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents:
29
diff
changeset
|
253 line[14]=next_dict[uniprotID] |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
254 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
255 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv" |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
256 path = os.path.join(target_directory,output_file) |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
257 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
258 with open(path,"w") as out : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
259 w = csv.writer(out,delimiter='\t') |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
260 w.writerows(tab) |
|
31
faeeabb11a4d
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
30
diff
changeset
|
261 |
|
36
b18cb0d11971
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
35
diff
changeset
|
262 subprocess.call(['tar', '-czvf', archive+".tar.gz", archive]) |
|
44
7ebbf851598e
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents:
43
diff
changeset
|
263 shutil.rmtree(archive, ignore_errors=True) |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
264 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
265 name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"} |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
266 name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")" |
|
21
026177e4ff4b
planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents:
20
diff
changeset
|
267 release = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") |
|
23
ebdd4961c6c2
planemo upload commit 03015f58ac7e7ba3cc44ba0d8899eacfaaf5a134-dirty
proteore
parents:
22
diff
changeset
|
268 id = str(10000000000 - int(time.strftime("%Y%m%d"))) #new ids must be inferior to previous id -> sort by <filter> in xml only in descending order |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
269 |
|
21
026177e4ff4b
planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents:
20
diff
changeset
|
270 data_table_entry = dict(id=id, release=release , name = name, species = species, value = path) |
|
2
2e34ee6d2d37
planemo upload commit 4dd1a2f7d196a1d2e70fab379a2c08367da0fe94-dirty
proteore
parents:
1
diff
changeset
|
271 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_"+species) |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
272 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
def download_from_uniprot_ftp(file,target_directory) :
    """Download one per-organism idmapping file from the UniProt FTP server.

    Parameters:
        file: name of the remote file, fetched from the ``by_organism``
            idmapping directory of the current UniProt release.
        target_directory: local directory the file is written into.

    Returns:
        The local path of the downloaded file, i.e. ``target_directory``
        joined with ``file``.
    """
    ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.uniprot.org")
    try :
        ftp.login("anonymous", "anonymous")
        ftp.cwd(ftp_dir)
        # The local file is closed (and flushed) as soon as the transfer ends,
        # instead of relying on garbage collection of an anonymous handle.
        with open(path, 'wb') as local_file :
            ftp.retrbinary("RETR " + file, local_file.write)
    except Exception :
        # Release the socket without a QUIT handshake, then let the error propagate.
        ftp.close()
        raise
    ftp.quit()
    return (path)
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
282 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
def id_list_from_nextprot_ftp(file,target_directory) :
    """Download a neXtProt accession list and return its lines.

    Parameters:
        file: name of the remote file in the ``ac_lists`` directory of the
            current neXtProt release.
        target_directory: local directory the file is written into.

    Returns:
        A list with one string per line of the downloaded file. The file
        itself is left in ``target_directory``.

    NOTE(review): a caller in this file passes the returned value to open()
    and shutil.move() as if it were a path; confirm which contract is
    intended before changing either side.
    """
    ftp_dir = "pub/current_release/ac_lists/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.nextprot.org")
    try :
        ftp.login("anonymous", "anonymous")
        ftp.cwd(ftp_dir)
        # Close the local file before it is re-read below, so that the whole
        # payload is guaranteed to be flushed to disk.
        with open(path, 'wb') as local_file :
            ftp.retrbinary("RETR " + file, local_file.write)
    except Exception :
        # Release the socket without a QUIT handshake, then let the error propagate.
        ftp.close()
        raise
    ftp.quit()
    with open(path,'r') as handle :
        nextprot_ids = handle.read().splitlines()
    return (nextprot_ids)
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
294 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
295 #return '' if there's no value in a dictionary, avoid error |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
def access_dictionary (dico,key1,key2) :
    """Return ``dico[key1][key2]``, or an empty string when a key is missing.

    Parameters:
        dico: two-level mapping (outer key -> inner mapping).
        key1: key looked up in ``dico``.
        key2: key looked up in ``dico[key1]``.

    Returns:
        The stored value, or ``''`` if ``key1`` is not in ``dico`` or
        ``key2`` is not in ``dico[key1]``.
    """
    if key1 not in dico :
        return ('')
    sub_dico = dico[key1]
    if key2 not in sub_dico :
        return ("")
    return (sub_dico[key2])
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
305 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
306 #if there are several nextprot ID for one uniprotID, return the uniprot like ID |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
def clean_nextprot_id (next_id,uniprotAc) :
    """Reduce a ';'-separated list of neXtProt IDs to a single ID.

    Parameters:
        next_id: one neXtProt ID, or several joined with ';'.
        uniprotAc: accession used to build the preferred ID ``"NX_" + uniprotAc``.

    Returns:
        ``next_id`` unchanged when it holds a single ID; otherwise
        ``"NX_" + uniprotAc`` if that ID is in the list, else the second ID
        of the list.
    """
    candidates = next_id.split(";")
    if len(candidates) <= 1 :
        return (next_id)
    preferred = "NX_"+uniprotAc
    if preferred in candidates :
        return (preferred)
    # No ID matches the accession: the second listed ID is kept (existing behaviour).
    return (candidates[1])
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
316 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
317 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
318 ####################################################################################################### |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
319 # 4. Build protein interaction maps files |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
320 ####################################################################################################### |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
321 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
def get_interactant_name(line,dico):
    """Look up the names of the two interactants of one interaction row.

    Parameters:
        line: sequence whose items 0 and 1 are the keys of interactant A and B.
        dico: mapping from those keys to interactant names.

    Returns:
        A tuple ``(interactant_A, interactant_B)``; a key that is absent from
        ``dico`` yields ``"NA"`` for that position.
    """
    interactant_A = dico.get(line[0], "NA")
    interactant_B = dico.get(line[1], "NA")

    return interactant_A, interactant_B
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
335 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
336 def PPI_ref_files(data_manager_dict, species, interactome, target_directory): |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
337 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
338 species_dict={'Human':'Homo sapiens',"Mouse":"Mus musculus","Rat":"Rattus norvegicus"} |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
339 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
340 ##BioGRID |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
341 if interactome=="biogrid": |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
342 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
343 tab2_link="https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.167/BIOGRID-ORGANISM-3.5.167.tab2.zip" |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
344 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
345 #download zip file |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
346 r = requests.get(tab2_link) |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
347 with open("BioGRID.zip", "wb") as code: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
348 code.write(r.content) |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
349 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
350 #unzip files |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
351 with zipfile.ZipFile("BioGRID.zip", 'r') as zip_ref: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
352 if not os.path.exists("tmp_BioGRID"): os.makedirs("tmp_BioGRID") |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
353 zip_ref.extractall("tmp_BioGRID") |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
354 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
355 #import file of interest and build dictionary |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
356 file_path="tmp_BioGRID/BIOGRID-ORGANISM-"+species_dict[species].replace(" ","_")+"-3.5.167.tab2.txt" |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
357 with open(file_path,"r") as handle : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
358 tab_file = csv.reader(handle,delimiter="\t") |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
359 dico_network = {} |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
360 GeneID_index=1 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
361 network_cols=[1,2,7,8,11,12,14,18,20] |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
362 for line in tab_file : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
363 if line[GeneID_index] not in dico_network: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
364 dico_network[line[GeneID_index]]=[[line[i] for i in network_cols]] |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
365 else: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
366 dico_network[line[GeneID_index]].append([line[i] for i in network_cols]) |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
367 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
368 #delete tmp_BioGRID directory |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
369 os.remove("BioGRID.zip") |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
370 shutil.rmtree("tmp_BioGRID", ignore_errors=True) |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
371 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
372 #download NCBI2Reactome.txt file and build dictionary |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
373 with requests.Session() as s: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
374 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt') |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
375 r.encoding ="utf-8" |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
376 tab_file = csv.reader(r.content.splitlines(), delimiter='\t') |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
377 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
378 dico_nodes = {} |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
379 geneid_index=0 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
380 pathway_description_index=3 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
381 species_index=5 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
382 for line in tab_file : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
383 if line[species_index]==species_dict[species]: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
384 if line[geneid_index] in dico_nodes : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
385 dico_nodes[line[geneid_index]].append(line[pathway_description_index]) |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
386 else : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
387 dico_nodes[line[geneid_index]] = [line[pathway_description_index]] |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
388 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
389 dico={} |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
390 dico['network']=dico_network |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
391 dico['nodes']=dico_nodes |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
392 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
393 ##Bioplex |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
394 elif interactome=="bioplex": |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
395 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
396 with requests.Session() as s: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
397 r = s.get('http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv') |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
398 r = r.content.decode('utf-8') |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
399 bioplex = csv.reader(r.splitlines(), delimiter='\t') |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
400 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
401 dico_network = {} |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
402 dico_network["GeneID"]={} |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
403 network_geneid_cols=[0,1,4,5,8] |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
404 dico_network["UniProt-AC"]={} |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
405 network_uniprot_cols=[2,3,4,5,8] |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
406 dico_GeneID_to_UniProt = {} |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
407 for line in bioplex : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
408 if line[0] not in dico_network["GeneID"]: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
409 dico_network["GeneID"][line[0]]=[[line[i] for i in network_geneid_cols]] |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
410 else : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
411 dico_network["GeneID"][line[0]].append([line[i] for i in network_geneid_cols]) |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
412 if line[1] not in dico_network["UniProt-AC"]: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
413 dico_network["UniProt-AC"][line[2]]=[[line[i] for i in network_uniprot_cols]] |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
414 else: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
415 dico_network["UniProt-AC"][line[2]].append([line[i] for i in network_uniprot_cols]) |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
416 dico_GeneID_to_UniProt[line[0]]=line[2] |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
417 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
418 with requests.Session() as s: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
419 r = s.get('https://reactome.org/download/current/UniProt2Reactome.txt') |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
420 r.encoding ="utf-8" |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
421 tab_file = csv.reader(r.content.splitlines(), delimiter='\t') |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
422 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
423 dico_nodes_uniprot = {} |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
424 uniProt_index=0 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
425 pathway_description_index=3 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
426 species_index=5 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
427 for line in tab_file : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
428 if line[species_index]==species_dict[species]: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
429 if line[uniProt_index] in dico_nodes_uniprot : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
430 dico_nodes_uniprot[line[uniProt_index]].append(line[pathway_description_index]) |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
431 else : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
432 dico_nodes_uniprot[line[uniProt_index]] = [line[pathway_description_index]] |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
433 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
434 with requests.Session() as s: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
435 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt') |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
436 r.encoding ="utf-8" |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
437 tab_file = csv.reader(r.content.splitlines(), delimiter='\t') |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
438 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
439 dico_nodes_geneid = {} |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
440 geneid_index=0 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
441 pathway_description_index=3 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
442 species_index=5 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
443 for line in tab_file : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
444 if line[species_index]==species_dict[species]: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
445 if line[geneid_index] in dico_nodes_geneid : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
446 dico_nodes_geneid[line[geneid_index]].append(line[pathway_description_index]) |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
447 else : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
448 dico_nodes_geneid[line[geneid_index]] = [line[pathway_description_index]] |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
449 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
450 dico={} |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
451 dico_nodes={} |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
452 dico_nodes['GeneID']=dico_nodes_geneid |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
453 dico_nodes['UniProt-AC']=dico_nodes_uniprot |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
454 dico['network']=dico_network |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
455 dico['nodes']=dico_nodes |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
456 dico['convert']=dico_GeneID_to_UniProt |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
457 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
458 ##Humap |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
459 elif interactome=="humap": |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
460 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
461 with requests.Session() as s: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
462 r = s.get('http://proteincomplexes.org/static/downloads/nodeTable.txt') |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
463 r = r.content.decode('utf-8') |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
464 humap_nodes = csv.reader(r.splitlines(), delimiter=',') |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
465 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
466 dico_geneid_to_gene_name={} |
|
9
cdd29444e0af
planemo upload commit 71363136045353f422ff98219c1eb84f6fc6193a-dirty
proteore
parents:
8
diff
changeset
|
467 dico_protein_name={} |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
468 for line in humap_nodes : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
469 if check_entrez_geneid(line[4]): |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
470 if line[4] not in dico_geneid_to_gene_name: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
471 dico_geneid_to_gene_name[line[4]]=line[3] |
|
9
cdd29444e0af
planemo upload commit 71363136045353f422ff98219c1eb84f6fc6193a-dirty
proteore
parents:
8
diff
changeset
|
472 if line[4] not in dico_protein_name: |
|
cdd29444e0af
planemo upload commit 71363136045353f422ff98219c1eb84f6fc6193a-dirty
proteore
parents:
8
diff
changeset
|
473 dico_protein_name[line[4]]=line[5] |
|
cdd29444e0af
planemo upload commit 71363136045353f422ff98219c1eb84f6fc6193a-dirty
proteore
parents:
8
diff
changeset
|
474 |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
475 with requests.Session() as s: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
476 r = s.get('http://proteincomplexes.org/static/downloads/pairsWprob.txt') |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
477 r = r.content.decode('utf-8') |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
478 humap = csv.reader(r.splitlines(), delimiter='\t') |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
479 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
480 dico_network = {} |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
481 for line in humap : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
482 if check_entrez_geneid(line[0]) and check_entrez_geneid(line[1]): |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
483 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
484 interactant_A, interactant_B = get_interactant_name(line,dico_geneid_to_gene_name) |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
485 |
|
4
ce01295cd601
planemo upload commit 51fc514a85c1055cab5bb6e76c90f3da7e648101-dirty
proteore
parents:
3
diff
changeset
|
486 #first interactant (first column) |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
487 if line[0] not in dico_network: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
488 dico_network[line[0]]=[line[:2]+[interactant_A,interactant_B,line[2]]] |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
489 else : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
490 dico_network[line[0]].append(line[:2]+[interactant_A,interactant_B,line[2]]) |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
491 |
|
4
ce01295cd601
planemo upload commit 51fc514a85c1055cab5bb6e76c90f3da7e648101-dirty
proteore
parents:
3
diff
changeset
|
492 #second interactant (second column) |
|
ce01295cd601
planemo upload commit 51fc514a85c1055cab5bb6e76c90f3da7e648101-dirty
proteore
parents:
3
diff
changeset
|
493 if line[1] not in dico_network: |
|
5
429e7481c392
planemo upload commit 51fc514a85c1055cab5bb6e76c90f3da7e648101-dirty
proteore
parents:
4
diff
changeset
|
494 dico_network[line[1]]=[[line[1],line[0],interactant_B,interactant_A,line[2]]] |
|
4
ce01295cd601
planemo upload commit 51fc514a85c1055cab5bb6e76c90f3da7e648101-dirty
proteore
parents:
3
diff
changeset
|
495 else : |
|
5
429e7481c392
planemo upload commit 51fc514a85c1055cab5bb6e76c90f3da7e648101-dirty
proteore
parents:
4
diff
changeset
|
496 dico_network[line[1]].append([line[1],line[0],interactant_B,interactant_A,line[2]]) |
|
4
ce01295cd601
planemo upload commit 51fc514a85c1055cab5bb6e76c90f3da7e648101-dirty
proteore
parents:
3
diff
changeset
|
497 |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
498 with requests.Session() as s: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
499 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt') |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
500 r.encoding ="utf-8" |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
501 tab_file = csv.reader(r.content.splitlines(), delimiter='\t') |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
502 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
503 dico_nodes = {} |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
504 geneid_index=0 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
505 pathway_description_index=3 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
506 species_index=5 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
507 for line in tab_file : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
508 if line[species_index]==species_dict[species]: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
509 #Fill dictionary with pathways |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
510 if line[geneid_index] in dico_nodes : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
511 dico_nodes[line[geneid_index]].append(line[pathway_description_index]) |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
512 else : |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
513 dico_nodes[line[geneid_index]] = [line[pathway_description_index]] |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
514 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
515 dico={} |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
516 dico['network']=dico_network |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
517 dico['nodes']=dico_nodes |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
518 dico['gene_name']=dico_geneid_to_gene_name |
|
9
cdd29444e0af
planemo upload commit 71363136045353f422ff98219c1eb84f6fc6193a-dirty
proteore
parents:
8
diff
changeset
|
519 dico['protein_name']=dico_protein_name |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
520 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
521 #writing output |
|
27
9a400ce8e4e6
planemo upload commit bb113d19d3a756f70784e6a1433902888686ed96-dirty
proteore
parents:
26
diff
changeset
|
522 output_file = species+'_'+interactome+'_'+ time.strftime("%Y-%m-%d") + ".json" |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
523 path = os.path.join(target_directory,output_file) |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
524 name = species+" ("+species_dict[species]+") "+time.strftime("%d/%m/%Y") |
|
27
9a400ce8e4e6
planemo upload commit bb113d19d3a756f70784e6a1433902888686ed96-dirty
proteore
parents:
26
diff
changeset
|
525 release = species+"_"+interactome+"_"+ time.strftime("%Y-%m-%d") |
|
26
bf6940ff60a8
planemo upload commit 0290724216a2c445b4e28842153b84a1b28f4e9a-dirty
proteore
parents:
24
diff
changeset
|
526 id = str(10000000000 - int(time.strftime("%Y%m%d"))) |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
527 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
528 with open(path, 'w') as handle: |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
529 json.dump(dico, handle, sort_keys=True) |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
530 |
|
26
bf6940ff60a8
planemo upload commit 0290724216a2c445b4e28842153b84a1b28f4e9a-dirty
proteore
parents:
24
diff
changeset
|
531 data_table_entry = dict(id=id, release=release, name = name, species = species, value = path) |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
532 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries") |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
533 |
|
6
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
534 ####################################################################################################### |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
535 # 5. nextprot (add protein features) |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
536 ####################################################################################################### |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
537 |
|
8
d5badf9de1b0
planemo upload commit 968cd5b4f78f0a1da86fc3bc29f8159f86e199aa-dirty
proteore
parents:
6
diff
changeset
|
538 def Build_nextprot_ref_file(data_manager_dict,target_directory): |
|
6
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
539 nextprot_ids_file = "nextprot_ac_list_all.txt" |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
540 ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory) |
|
12
f6afaa1f562c
planemo upload commit 39a9e2bf22b07beeca3fb77d86cda25820eb309c-dirty
proteore
parents:
11
diff
changeset
|
541 |
|
f6afaa1f562c
planemo upload commit 39a9e2bf22b07beeca3fb77d86cda25820eb309c-dirty
proteore
parents:
11
diff
changeset
|
542 output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv" |
|
f6afaa1f562c
planemo upload commit 39a9e2bf22b07beeca3fb77d86cda25820eb309c-dirty
proteore
parents:
11
diff
changeset
|
543 path = os.path.join(target_directory,output_file) |
|
f6afaa1f562c
planemo upload commit 39a9e2bf22b07beeca3fb77d86cda25820eb309c-dirty
proteore
parents:
11
diff
changeset
|
544 name = "neXtProt release "+time.strftime("%d-%m-%Y") |
|
16
f75c525e0a4a
planemo upload commit 04868e380c43447fac1309fc292785d67863a87b-dirty
proteore
parents:
15
diff
changeset
|
545 release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y") |
|
12
f6afaa1f562c
planemo upload commit 39a9e2bf22b07beeca3fb77d86cda25820eb309c-dirty
proteore
parents:
11
diff
changeset
|
546 |
|
13
098693479a9d
planemo upload commit 64366857c1f29dd79a77dc331eb0153cb94cdb11
proteore
parents:
12
diff
changeset
|
547 output = open(path, 'w') |
|
098693479a9d
planemo upload commit 64366857c1f29dd79a77dc331eb0153cb94cdb11
proteore
parents:
12
diff
changeset
|
548 writer = csv.writer(output,delimiter="\t") |
|
12
f6afaa1f562c
planemo upload commit 39a9e2bf22b07beeca3fb77d86cda25820eb309c-dirty
proteore
parents:
11
diff
changeset
|
549 |
|
6
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
550 nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]] |
|
14
3f9ba522bfe8
planemo upload commit 272d182247f4fa55a928ec8cdcb83eab50c7f6ef
proteore
parents:
13
diff
changeset
|
551 writer.writerows(nextprot_file) |
|
3f9ba522bfe8
planemo upload commit 272d182247f4fa55a928ec8cdcb83eab50c7f6ef
proteore
parents:
13
diff
changeset
|
552 |
|
6
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
553 for id in ids : |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
554 query="https://api.nextprot.org/entry/"+id+".json" |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
555 resp = requests.get(url=query) |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
556 data = resp.json() |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
557 |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
558 #get info from json dictionary |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
559 mass_mol = data["entry"]["isoforms"][0]["massAsString"] |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
560 seq_length = data['entry']["isoforms"][0]["sequenceLength"] |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
561 iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"] |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
562 chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"] |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
563 protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level']) |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
564 |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
565 #put all subcell loc in a set |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
566 if "subcellular-location" in data['entry']["annotationsByCategory"].keys() : |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
567 subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"] |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
568 all_subcell_locs = set() |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
569 for loc in subcell_locs : |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
570 all_subcell_locs.add(loc['cvTermName']) |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
571 all_subcell_locs.discard("") |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
572 all_subcell_locs = ";".join(all_subcell_locs) |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
573 else : |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
574 all_subcell_locs = "NA" |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
575 |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
576 #put all diseases in a set |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
577 if ('disease') in data['entry']['annotationsByCategory'].keys() : |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
578 diseases = data['entry']['annotationsByCategory']['disease'] |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
579 all_diseases = set() |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
580 for disease in diseases : |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
581 if (disease['cvTermName'] is not None and disease['cvTermName'] != ""): |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
582 all_diseases.add(disease['cvTermName']) |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
583 if len(all_diseases) > 0 : all_diseases = ";".join(all_diseases) |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
584 else : all_diseases="NA" |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
585 else : |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
586 all_diseases="NA" |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
587 |
|
11
ac2cd728c40e
planemo upload commit 39a9e2bf22b07beeca3fb77d86cda25820eb309c
proteore
parents:
9
diff
changeset
|
588 #get all tm domain |
|
6
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
589 nb_domains = 0 |
|
11
ac2cd728c40e
planemo upload commit 39a9e2bf22b07beeca3fb77d86cda25820eb309c
proteore
parents:
9
diff
changeset
|
590 if "transmembrane-region" in data['entry']['annotationsByCategory'].keys(): |
|
ac2cd728c40e
planemo upload commit 39a9e2bf22b07beeca3fb77d86cda25820eb309c
proteore
parents:
9
diff
changeset
|
591 tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"] |
|
ac2cd728c40e
planemo upload commit 39a9e2bf22b07beeca3fb77d86cda25820eb309c
proteore
parents:
9
diff
changeset
|
592 all_tm_domains = set() |
|
ac2cd728c40e
planemo upload commit 39a9e2bf22b07beeca3fb77d86cda25820eb309c
proteore
parents:
9
diff
changeset
|
593 for tm in tm_domains : |
|
ac2cd728c40e
planemo upload commit 39a9e2bf22b07beeca3fb77d86cda25820eb309c
proteore
parents:
9
diff
changeset
|
594 all_tm_domains.add(tm['cvTermName']) |
|
ac2cd728c40e
planemo upload commit 39a9e2bf22b07beeca3fb77d86cda25820eb309c
proteore
parents:
9
diff
changeset
|
595 nb_domains+=1 |
|
24
6cbb76823b7b
planemo upload commit 3e8aad84b7fe9218370364b1aef3ea5779475f9f-dirty
proteore
parents:
23
diff
changeset
|
596 #print "nb domains ++" |
|
6cbb76823b7b
planemo upload commit 3e8aad84b7fe9218370364b1aef3ea5779475f9f-dirty
proteore
parents:
23
diff
changeset
|
597 #print (nb_domains) |
|
15
5f39d3ef5fe3
planemo upload commit 45108c8cbeeccab12031bbf6e65f1177222d4e10
proteore
parents:
14
diff
changeset
|
598 nextprot_file[:] = [] |
|
12
f6afaa1f562c
planemo upload commit 39a9e2bf22b07beeca3fb77d86cda25820eb309c-dirty
proteore
parents:
11
diff
changeset
|
599 nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence]) |
|
6
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
600 writer.writerows(nextprot_file) |
|
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
601 |
|
24
6cbb76823b7b
planemo upload commit 3e8aad84b7fe9218370364b1aef3ea5779475f9f-dirty
proteore
parents:
23
diff
changeset
|
602 id = str(10000000000 - int(time.strftime("%Y%m%d"))) |
|
6cbb76823b7b
planemo upload commit 3e8aad84b7fe9218370364b1aef3ea5779475f9f-dirty
proteore
parents:
23
diff
changeset
|
603 |
|
6cbb76823b7b
planemo upload commit 3e8aad84b7fe9218370364b1aef3ea5779475f9f-dirty
proteore
parents:
23
diff
changeset
|
604 data_table_entry = dict(id=id, release=release_id, name = name, value = path) |
|
6
f281a1eb83d6
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents:
5
diff
changeset
|
605 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref") |
|
0
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
606 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
607 ####################################################################################################### |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
608 # Main function |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
609 ####################################################################################################### |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
def main():
    """Command-line entry point of the ProteoRE resource-building data manager.

    Parses the command-line options, reads the JSON parameter file given by
    ``-o/--output``, creates the ``extra_files_path`` directory named in it,
    runs every resource builder whose option was supplied, and finally
    overwrites the ``--output`` file with the JSON-serialised data manager
    dictionary filled in by those builders.

    Options (all optional for argparse; an option left out is ``None``):
        --hpa             comma-separated values, each passed to HPA_sources
        --peptideatlas    comma-separated values, each passed to
                          peptide_atlas_sources together with --date
        --id_mapping      comma-separated species, each passed to
                          id_mapping_sources together with --tool_data_path
        --interactome     PPI source name passed to PPI_ref_files
        --species         species used only when --interactome is "biogrid";
                          any other interactome uses "Human"
        --database        when given, triggers Build_nextprot_ref_file
        -o / --output     path of the JSON file read at start-up and
                          rewritten at the end

    Raises:
        OSError: if the target directory cannot be created (``os.mkdir``
            fails, e.g. because it already exists) or a file cannot be opened.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--hpa", metavar="HPA_OPTION")
    parser.add_argument("--peptideatlas", metavar="SAMPLE_CATEGORY_ID")
    parser.add_argument("--id_mapping", metavar="ID_MAPPING_SPECIES")
    parser.add_argument("--interactome", metavar="PPI")
    parser.add_argument("--species")
    parser.add_argument("--date")
    parser.add_argument("-o", "--output")
    parser.add_argument("--database")
    parser.add_argument("--tool_data_path")
    args = parser.parse_args()

    data_manager_dict = {}

    # Extract json file params; the context manager closes the handle
    # deterministically instead of relying on garbage collection.
    filename = args.output
    with open(filename) as params_file:
        params = from_json_string(params_file.read())
    target_directory = params['output_data'][0]['extra_files_path']
    os.mkdir(target_directory)

    # NOTE: argparse always defines every declared option on ``args`` (None
    # when it is not supplied), so plain attribute access is sufficient here;
    # no exception handling is needed to detect a missing option.

    # Download source files from HPA
    if args.hpa is not None:
        for hpa_tissue in args.hpa.split(","):
            HPA_sources(data_manager_dict, hpa_tissue, target_directory)

    # Download source file from Peptide Atlas query
    if args.peptideatlas is not None:
        for pa_tissue in args.peptideatlas.split(","):
            peptide_atlas_sources(data_manager_dict, pa_tissue, args.date, target_directory)

    # Download ID_mapping source file from Uniprot
    if args.id_mapping is not None:
        for mapping_species in args.id_mapping.split(","):
            id_mapping_sources(data_manager_dict, mapping_species, target_directory, args.tool_data_path)

    # Download PPI ref files from biogrid/bioplex/humap.
    # Only biogrid honours --species; every other interactome uses "Human".
    interactome = args.interactome
    species = args.species if interactome == "biogrid" else "Human"
    if interactome is not None and species is not None:
        PPI_ref_files(data_manager_dict, species, interactome, target_directory)

    # Build nextprot ref file for add protein features
    if args.database is not None:
        Build_nextprot_ref_file(data_manager_dict, target_directory)

    # Save info to json file. The serialised JSON is text, so the file is
    # opened in text mode: writing a str to a 'wb' handle raises TypeError on
    # Python 3.
    with open(filename, 'w') as output_file:
        output_file.write(to_json_string(data_manager_dict))
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
687 |
|
0a26460d7366
planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff
changeset
|
# Script entry point: build the requested resources only when this file is
# executed directly, never as a side effect of importing it.
if __name__ == "__main__":
    main()
