annotate data_manager/resource_building.py @ 62:add6aa698fb0 draft

"planemo upload commit 4747fc3ca8e24e0f6c0cfcde0992780c6d4ef4ff-dirty"
author proteore
date Tue, 09 Jun 2020 16:14:33 +0000
parents da9e74d3c40d
children 54089754ba12
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
1 # -*- coding: utf-8 -*-
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
2 """
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
3 The purpose of this script is to create source files from different databases to be used in other proteore tools
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
4 """
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
5
42
f6a6a70712c4 "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 41
diff changeset
6 import os, shutil, sys, argparse, requests, time, csv, re, json, shutil, zipfile, subprocess
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
7 from io import BytesIO
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
8 from zipfile import ZipFile
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
9 from galaxy.util.json import from_json_string, to_json_string
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
10
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
11 #######################################################################################################
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
12 # General functions
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
13 #######################################################################################################
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
def unzip(url, output_file):
    """
    Download a zip archive from `url` and save its first member to `output_file`.

    Only the first entry listed in the archive is extracted; any other
    member is ignored. The member is handled as raw bytes from download to
    disk, so its content is written unchanged.

    Raises requests.HTTPError when the server answers with an error status,
    and IndexError when the archive contains no member.
    """
    response = requests.get(url)
    # Fail on HTTP errors instead of trying to unzip an error page.
    response.raise_for_status()

    # ZipFile.read() returns bytes: keep it as bytes and write it in binary
    # mode (appending it to a str raises TypeError on Python 3).
    with ZipFile(BytesIO(response.content)) as archive:
        member_content = archive.read(archive.namelist()[0])

    with open(output_file, "wb") as output:
        output.write(member_content)
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
25
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
26 def _add_data_table_entry(data_manager_dict, data_table_entry,data_table):
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
27 data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
28 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
29 data_manager_dict['data_tables'][data_table].append(data_table_entry)
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
30 return data_manager_dict
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
31
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
32 #######################################################################################################
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
33 # 1. Human Protein Atlas
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
34 # - Normal tissue
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
35 # - Pathology
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
36 # - Full Atlas
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
37 #######################################################################################################
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
def HPA_sources(data_manager_dict, tissue, target_directory):
    """
    Download one Human Protein Atlas file and register it in its data table.

    `tissue` selects the resource and must be one of "HPA_normal_tissue",
    "HPA_pathology", "HPA_full_atlas" or "HPA_RNA_tissue". The archive is
    downloaded and unzipped to "<tissue>_<dd-mm-YYYY>.tsv" inside
    `target_directory`, then an entry (id, release, name, tissue, value) is
    appended to the matching data table of `data_manager_dict`.

    Raises ValueError when `tissue` is not a known resource.
    """
    # tissue key -> (display name, download url, data table name)
    hpa_resources = {
        "HPA_normal_tissue": (
            "HPA normal tissue",
            "https://www.proteinatlas.org/download/normal_tissue.tsv.zip",
            "proteore_protein_atlas_normal_tissue",
        ),
        "HPA_pathology": (
            "HPA pathology",
            "https://www.proteinatlas.org/download/pathology.tsv.zip",
            "proteore_protein_atlas_tumor_tissue",
        ),
        "HPA_full_atlas": (
            "HPA full atlas",
            "https://www.proteinatlas.org/download/proteinatlas.tsv.zip",
            "proteore_protein_full_atlas",
        ),
        "HPA_RNA_tissue": (
            "HPA RNA tissue",
            "https://www.proteinatlas.org/download/rna_tissue_consensus.tsv.zip",
            "proteore_protein_atlas_rna_tissue",
        ),
    }
    if tissue not in hpa_resources:
        # Without this check the function failed later with an unbound local.
        raise ValueError("Unknown HPA resource '%s', expected one of: %s"
                         % (tissue, ", ".join(sorted(hpa_resources))))
    tissue_name, url, table = hpa_resources[tissue]

    # Read the clock once so that the file name, the label and the id always
    # agree, even when the download straddles midnight.
    now = time.localtime()

    output_file = tissue + "_" + time.strftime("%d-%m-%Y", now) + ".tsv"
    path = os.path.join(target_directory, output_file)
    unzip(url, path)  # download and save file

    tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y", now)
    release = tissue_name.replace(" ", "_").replace("/", "-")
    # 10000000000 - YYYYMMDD: a later date gives a smaller id
    # (presumably so that the newest release sorts first - to be confirmed).
    entry_id = str(10000000000 - int(time.strftime("%Y%m%d", now)))

    data_table_entry = dict(id=entry_id, release=release, name=tissue_name, tissue=tissue, value=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, table)
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
66
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
67
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
68 #######################################################################################################
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
69 # 2. Peptide Atlas
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
70 #######################################################################################################
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
def peptide_atlas_sources(data_manager_dict, tissue, date, target_directory):
    """
    Query Peptide Atlas for one build and register the resulting file.

    `tissue` has the form "<id>.<tissue name>": the part before the first
    dot is sent as the `atlas_build_id` of the query, the second part names
    the output. The tsv answer is reduced by build_dictionary() to one
    summed count per UniProt accession and written, with the header
    "Uniprot_AC" / "nb_obs", to "<tissue name>_<date>.tsv" inside
    `target_directory`. An entry is then appended to the
    "proteore_peptide_atlas" data table of `data_manager_dict`.

    Raises ValueError when `tissue` contains no dot, and requests.HTTPError
    when the server answers with an error status.
    """
    # Define organism_id (here Human) - to be upgraded when other organisms are added to the project
    organism_id = "2"

    # Extract sample_category_id and output filename
    tissue_parts = tissue.split(".")
    if len(tissue_parts) < 2:
        raise ValueError("Invalid tissue '%s', expected '<id>.<tissue name>'" % tissue)
    sample_category_id = tissue_parts[0]
    tissue_name = tissue_parts[1]
    output_file = tissue_name + "_" + date + ".tsv"

    # Built from separate literals rather than a backslash continuation inside
    # a string, which would embed the indentation of the next line in the URL.
    query = ("https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetProteins?&atlas_build_id="
             + sample_category_id
             + "&display_options=ShowAbundances&organism_id=" + organism_id
             + "&redundancy_constraint=4&presence_level_constraint=1%2C2&gene_annotation_level_constraint=leaf"
             + "&QUERY_NAME=AT_GetProteins&action=QUERY&output_mode=tsv&apply_action=QUERY")

    with requests.Session() as s:
        download = s.get(query)
        # Do not parse an HTTP error page as if it were the tsv answer.
        download.raise_for_status()
        decoded_content = download.content.decode('utf-8')
        cr = csv.reader(decoded_content.splitlines(), delimiter='\t')

    uni_dict = build_dictionary(cr)

    # columns of data table peptide_atlas
    tissue_id = tissue_name + "_" + date
    # every "-" becomes "/" and every "_" becomes " " (in the tissue name too)
    name = tissue_id.replace("-", "/").replace("_", " ")
    path = os.path.join(target_directory, output_file)

    with open(path, "w") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerow(["Uniprot_AC", "nb_obs"])
        w.writerows(uni_dict.items())

    data_table_entry = dict(id=tissue_id, name=name, value=path, tissue=tissue_name)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_peptide_atlas")
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
104
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
105 #function to count the number of observations by uniprot id
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
def build_dictionary(csv):
    """
    Sum the observation counts per UniProt accession.

    `csv` is an iterable of rows (lists of strings). Column 0 is the
    identifier and column 5 the count, converted with int(). A row is kept
    only when its identifier contains no "-" and is accepted by
    check_uniprot_access(); counts of rows sharing an identifier are added.
    Empty rows are skipped.

    Returns a dict mapping identifier -> summed count.
    """
    uni_dict = {}
    for line in csv:
        if not line:
            # csv.reader yields an empty list for a blank input line
            continue
        accession = line[0]
        if "-" not in accession and check_uniprot_access(accession):
            uni_dict[accession] = uni_dict.get(accession, 0) + int(line[5])

    return uni_dict
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
116
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
117 #function to check if an id is a UniProt accession number: returns True or False
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
def check_uniprot_access(id):
    """
    Return True when `id` begins with a UniProt accession number, else False.

    The pattern is applied with re.match, so only the start of the string
    is tested: trailing characters after a valid accession are accepted.
    """
    accession_re = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
    return accession_re.match(id) is not None
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
124
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
def check_entrez_geneid(id):
    """
    Return True when `id` begins like an Entrez gene id, else False.

    Accepted starts are: digits, or one or two capital letters followed by
    "_" and then either digits or one to four capital letters and digits.
    The pattern is applied with re.match, so only the start of the string
    is tested.
    """
    geneid_re = re.compile("[0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+")
    return geneid_re.match(id) is not None
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
131
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
132 #######################################################################################################
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
133 # 3. ID mapping file
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
134 #######################################################################################################
51
55b12ec24a9f "planemo upload commit 59b014e9f6e2d668cbd7c4844b10db3d59baefd8-dirty"
proteore
parents: 50
diff changeset
135 import ftplib, gzip
55b12ec24a9f "planemo upload commit 59b014e9f6e2d668cbd7c4844b10db3d59baefd8-dirty"
proteore
parents: 50
diff changeset
136 from io import StringIO
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
# Raise the csv field size limit as high as the platform allows, to handle big files.
# sys.maxsize does not always fit in the C long used by the csv module (e.g. on
# 64-bit Windows); field_size_limit then raises OverflowError, so the value is
# reduced until it is accepted.
_csv_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_csv_field_limit)
        break
    except OverflowError:
        _csv_field_limit = _csv_field_limit // 10
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
138
31
faeeabb11a4d "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 30
diff changeset
139 def id_mapping_sources (data_manager_dict, species, target_directory, tool_data_path) :
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
140
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
141 human = species == "Human"
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
142 species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" }
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
143 files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
36
b18cb0d11971 "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 35
diff changeset
144 archive = os.path.join(tool_data_path, "id_mapping/ID_mapping_archive_"+species+"_"+str(time.strftime("%Y%m%d")))
38
6daab0a711e2 "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 37
diff changeset
145 if os.path.isdir(archive) is False : os.mkdir(archive)
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
146
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
147 #header
30
a6cabd3ab71f "planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents: 29
diff changeset
148 if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG",'Gene_Name']]
a6cabd3ab71f "planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents: 29
diff changeset
149 else : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG",'Gene_Name']]
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
150
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
151 #get selected.tab and keep only ids of interest
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
152 selected_tab_file=species_dict[species]+"_"+files[0]
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
153 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory)
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
154 with gzip.open(tab_path,"rt") as select :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
155 tab_reader = csv.reader(select,delimiter="\t")
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
156 for line in tab_reader :
30
a6cabd3ab71f "planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents: 29
diff changeset
157 tab.append([line[0]]+[line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
41
fe21769281fa "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 40
diff changeset
158 if os.path.exists(os.path.join(archive,tab_path.split("/")[-1])) : os.remove(os.path.join(archive,tab_path.split("/")[-1]))
31
faeeabb11a4d "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 30
diff changeset
159 shutil.move(tab_path, archive)
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
160 #print("selected_tab ok")
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
161
20
29cf75c83618 planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents: 16
diff changeset
162 #get uniprot-AC reviewed
22
4296fb613d9d planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents: 21
diff changeset
163 organism = species_dict[species].split("_")[1]
4296fb613d9d planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents: 21
diff changeset
164 query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+organism+"&format=list"
20
29cf75c83618 planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents: 16
diff changeset
165
29cf75c83618 planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents: 16
diff changeset
166 with requests.Session() as s:
29cf75c83618 planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents: 16
diff changeset
167 download = s.get(query)
29cf75c83618 planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents: 16
diff changeset
168 decoded_content = download.content.decode('utf-8')
29cf75c83618 planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents: 16
diff changeset
169 uniprot_reviewed_list = decoded_content.splitlines()
29cf75c83618 planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents: 16
diff changeset
170
31
faeeabb11a4d "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 30
diff changeset
171 #save reviewed list
faeeabb11a4d "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 30
diff changeset
172 reviewed_list_path = os.path.join(archive,'uniprot_reviewed_list.txt')
34
06b3181206b4 "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 33
diff changeset
173 with open(reviewed_list_path,'w') as reviewed_list_file:
31
faeeabb11a4d "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 30
diff changeset
174 for id in uniprot_reviewed_list:
faeeabb11a4d "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 30
diff changeset
175 reviewed_list_file.write(id+"\n")
faeeabb11a4d "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 30
diff changeset
176
faeeabb11a4d "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 30
diff changeset
177 #remove unreviewed uniprot-AC
20
29cf75c83618 planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents: 16
diff changeset
178 for line in tab[1:]:
30
a6cabd3ab71f "planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents: 29
diff changeset
179 UniProtAC = line[1]
28
808c8493ed4f planemo upload commit 5c2c274361c0daceae1f678eca0e6c0d5b4ba4f7-dirty
proteore
parents: 27
diff changeset
180 if UniProtAC not in uniprot_reviewed_list :
29
9a40b72414de planemo upload commit 3c02acc07df862410ce979ce63e3d14f10edc50f-dirty
proteore
parents: 28
diff changeset
181 line[1]=""
20
29cf75c83618 planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents: 16
diff changeset
182
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
183 """
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
184 Supplementary ID to get from HUMAN_9606_idmapping.dat :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
185 -NextProt,BioGrid,STRING,KEGG
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
186 """
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
187
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
188 #there's more id type for human
30
a6cabd3ab71f "planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents: 29
diff changeset
189 if human : ids = ['neXtProt','BioGrid','STRING','KEGG','Gene_Name' ] #ids to get from dat_file
a6cabd3ab71f "planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents: 29
diff changeset
190 else : ids = ['BioGrid','STRING','KEGG','Gene_Name' ]
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
191 unidict = {}
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
192
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
193 #keep only ids of interest in dictionaries
40
a7bcc9a0a456 "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 39
diff changeset
194 dat_file = species_dict[species]+"_"+files[1]
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
195 dat_path = download_from_uniprot_ftp(dat_file,target_directory)
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
196 with gzip.open(dat_path,"rt") as dat :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
197 dat_reader = csv.reader(dat,delimiter="\t")
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
198 for line in dat_reader :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
199 uniprotID=line[0] #UniProtID as key
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
200 id_type=line[1] #ID type of corresponding id, key of sub-dictionnary
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
201 cor_id=line[2] #corresponding id
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
202 if "-" not in id_type : #we don't keep isoform
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
203 if id_type in ids and uniprotID in unidict :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
204 if id_type in unidict[uniprotID] :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
205 unidict[uniprotID][id_type]= ";".join([unidict[uniprotID][id_type],cor_id]) #if there is already a value in the dictionnary
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
206 else :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
207 unidict[uniprotID].update({ id_type : cor_id })
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
208 elif id_type in ids :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
209 unidict[uniprotID]={id_type : cor_id}
41
fe21769281fa "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 40
diff changeset
210 if os.path.exists(os.path.join(archive,dat_path.split("/")[-1])) : os.remove(os.path.join(archive,dat_path.split("/")[-1]))
31
faeeabb11a4d "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 30
diff changeset
211 shutil.move(dat_path, archive)
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
212
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
213 #print("dat_file ok")
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
214
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
215 #add ids from idmapping.dat to the final tab
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
216 for line in tab[1:] :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
217 uniprotID=line[0]
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
218 if human :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
219 if uniprotID in unidict :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
220 nextprot = access_dictionary(unidict,uniprotID,'neXtProt')
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
221 if nextprot != '' : nextprot = clean_nextprot_id(nextprot,line[0])
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
222 line.extend([nextprot,access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'),
30
a6cabd3ab71f "planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents: 29
diff changeset
223 access_dictionary(unidict,uniprotID,'KEGG'),access_dictionary(unidict,uniprotID,'Gene_Name')])
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
224 else :
30
a6cabd3ab71f "planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents: 29
diff changeset
225 line.extend(["","","","",""])
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
226 else :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
227 if uniprotID in unidict :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
228 line.extend([access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'),
30
a6cabd3ab71f "planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents: 29
diff changeset
229 access_dictionary(unidict,uniprotID,'KEGG'),access_dictionary(unidict,uniprotID,'Gene_Name')])
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
230 else :
30
a6cabd3ab71f "planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents: 29
diff changeset
231 line.extend(["","","",""])
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
232
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
233 #print ("tab ok")
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
234
20
29cf75c83618 planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents: 16
diff changeset
235 #add missing nextprot ID for human or replace old ones
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
236 if human :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
237 #build next_dict
54
109fc5236204 "planemo upload commit ebdd6549d01d60be6f07abca06f6ce4e2b6beda1"
proteore
parents: 53
diff changeset
238 nextprot_path = download_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
31
faeeabb11a4d "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 30
diff changeset
239 with open(nextprot_path,'r') as nextprot_ids :
faeeabb11a4d "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 30
diff changeset
240 nextprot_ids = nextprot_ids.read().splitlines()
41
fe21769281fa "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 40
diff changeset
241 if os.path.exists(os.path.join(archive,nextprot_path.split("/")[-1])) : os.remove(os.path.join(archive,nextprot_path.split("/")[-1]))
31
faeeabb11a4d "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 30
diff changeset
242 shutil.move(nextprot_path,archive)
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
243 next_dict = {}
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
244 for nextid in nextprot_ids :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
245 next_dict[nextid.replace("NX_","")] = nextid
45
488e015576bc "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 44
diff changeset
246 #os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt"))
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
247
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
248 #add missing nextprot ID
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
249 for line in tab[1:] :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
250 uniprotID=line[0]
30
a6cabd3ab71f "planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents: 29
diff changeset
251 nextprotID=line[14]
20
29cf75c83618 planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents: 16
diff changeset
252 if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) :
30
a6cabd3ab71f "planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
proteore
parents: 29
diff changeset
253 line[14]=next_dict[uniprotID]
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
254
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
255 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
256 path = os.path.join(target_directory,output_file)
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
257
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
258 with open(path,"w") as out :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
259 w = csv.writer(out,delimiter='\t')
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
260 w.writerows(tab)
31
faeeabb11a4d "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 30
diff changeset
261
36
b18cb0d11971 "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 35
diff changeset
262 subprocess.call(['tar', '-czvf', archive+".tar.gz", archive])
44
7ebbf851598e "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
proteore
parents: 43
diff changeset
263 shutil.rmtree(archive, ignore_errors=True)
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
264
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
265 name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"}
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
266 name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")"
21
026177e4ff4b planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents: 20
diff changeset
267 release = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
23
ebdd4961c6c2 planemo upload commit 03015f58ac7e7ba3cc44ba0d8899eacfaaf5a134-dirty
proteore
parents: 22
diff changeset
268 id = str(10000000000 - int(time.strftime("%Y%m%d"))) #new ids must be inferior to previous id -> sort by <filter> in xml only in descending order
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
269
21
026177e4ff4b planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
proteore
parents: 20
diff changeset
270 data_table_entry = dict(id=id, release=release , name = name, species = species, value = path)
2
2e34ee6d2d37 planemo upload commit 4dd1a2f7d196a1d2e70fab379a2c08367da0fe94-dirty
proteore
parents: 1
diff changeset
271 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_"+species)
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
272
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
def download_from_uniprot_ftp(file, target_directory):
    """Download one UniProt id-mapping file over anonymous FTP.

    The file is fetched from ``ftp.uniprot.org`` in the
    ``idmapping/by_organism`` directory of the current release and written,
    in binary mode, to ``target_directory``.

    Args:
        file: name of the remote file; also used as the local file name.
        target_directory: local directory the file is written into.

    Returns:
        The local path of the downloaded file.
    """
    ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.uniprot.org")
    try:
        ftp.login("anonymous", "anonymous")
        ftp.cwd(ftp_dir)
        # The context manager guarantees the local file is flushed and closed
        # before the path is handed back to the caller.
        with open(path, 'wb') as local_file:
            ftp.retrbinary("RETR " + file, local_file.write)
    except Exception:
        # Release the socket on failure, then let the error propagate.
        ftp.close()
        raise
    ftp.quit()
    return path
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
282
62
add6aa698fb0 "planemo upload commit 4747fc3ca8e24e0f6c0cfcde0992780c6d4ef4ff-dirty"
proteore
parents: 60
diff changeset
def download_from_nextprot_ftp(file, target_directory):
    """Download one neXtProt accession-list file over anonymous FTP.

    The file is fetched from ``ftp.nextprot.org`` in
    ``pub/current_release/ac_lists/`` and written, in binary mode, to
    ``target_directory``.

    Args:
        file: name of the remote file; also used as the local file name.
        target_directory: local directory the file is written into.

    Returns:
        The local path of the downloaded file.
    """
    ftp_dir = "pub/current_release/ac_lists/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.nextprot.org")
    try:
        ftp.login("anonymous", "anonymous")
        ftp.cwd(ftp_dir)
        # The context manager guarantees the local file is flushed and closed
        # before the path is handed back to the caller.
        with open(path, 'wb') as local_file:
            ftp.retrbinary("RETR " + file, local_file.write)
    except Exception:
        # Release the socket on failure, then let the error propagate.
        ftp.close()
        raise
    ftp.quit()
    return path
54
109fc5236204 "planemo upload commit ebdd6549d01d60be6f07abca06f6ce4e2b6beda1"
proteore
parents: 53
diff changeset
292
62
add6aa698fb0 "planemo upload commit 4747fc3ca8e24e0f6c0cfcde0992780c6d4ef4ff-dirty"
proteore
parents: 60
diff changeset
def id_list_from_nextprot_ftp(file, target_directory):
    """Download a neXtProt accession list and return its lines.

    The file is fetched over anonymous FTP from ``ftp.nextprot.org``
    (``pub/current_release/ac_lists/``), saved in ``target_directory``, then
    read back as text.

    Args:
        file: name of the remote file; also used as the local file name.
        target_directory: local directory the file is written into.

    Returns:
        A list with one string per line of the downloaded file, without
        line terminators.
    """
    ftp_dir = "pub/current_release/ac_lists/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.nextprot.org")
    try:
        ftp.login("anonymous", "anonymous")
        ftp.cwd(ftp_dir)
        # The download handle must be closed (and therefore flushed) before
        # the file is re-opened for reading below.
        with open(path, 'wb') as local_file:
            ftp.retrbinary("RETR " + file, local_file.write)
    except Exception:
        # Release the socket on failure, then let the error propagate.
        ftp.close()
        raise
    ftp.quit()
    with open(path, 'r') as listing:
        nextprot_ids = listing.read().splitlines()
    return nextprot_ids
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
304
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
305 #return '' if there's no value in a dictionary, avoid error
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
def access_dictionary(dico, key1, key2):
    """Look up ``dico[key1][key2]``, returning '' when either key is absent.

    Args:
        dico: mapping whose values are themselves containers indexed by key2.
        key1: key in the outer mapping.
        key2: key in the inner container.

    Returns:
        The nested value, or an empty string if key1 is not in ``dico`` or
        key2 is not in ``dico[key1]``.
    """
    if key1 not in dico:
        return ''
    inner = dico[key1]
    return inner[key2] if key2 in inner else ""
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
315
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
316 #if there are several nextprot ID for one uniprotID, return the uniprot like ID
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
def clean_nextprot_id(next_id, uniprotAc):
    """Reduce a ';'-separated list of neXtProt IDs to a single ID.

    Args:
        next_id: one neXtProt ID, or several joined with ';'.
        uniprotAc: UniProt accession the IDs belong to.

    Returns:
        ``next_id`` unchanged when it holds a single ID. Otherwise
        ``"NX_" + uniprotAc`` if that ID is in the list, else the second
        ID of the list.
    """
    candidates = next_id.split(";")
    if len(candidates) <= 1:
        return next_id
    expected = "NX_" + uniprotAc
    return expected if expected in candidates else candidates[1]
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
326
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
327
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
328 #######################################################################################################
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
329 # 4. Build protein interaction maps files
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
330 #######################################################################################################
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
331
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
def get_interactant_name(line, dico):
    """Translate the first two fields of ``line`` through ``dico``.

    Args:
        line: sequence whose items 0 and 1 are the identifiers to look up.
        dico: mapping from identifier to name.

    Returns:
        A ``(interactant_A, interactant_B)`` tuple; an identifier missing
        from ``dico`` is reported as the string "NA".
    """
    interactant_A = dico.get(line[0], "NA")
    interactant_B = dico.get(line[1], "NA")
    return interactant_A, interactant_B
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
345
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
346 def PPI_ref_files(data_manager_dict, species, interactome, target_directory):
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
347
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
348 species_dict={'Human':'Homo sapiens',"Mouse":"Mus musculus","Rat":"Rattus norvegicus"}
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
349
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
350 ##BioGRID
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
351 if interactome=="biogrid":
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
352
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
353 tab2_link="https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.167/BIOGRID-ORGANISM-3.5.167.tab2.zip"
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
354
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
355 #download zip file
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
356 r = requests.get(tab2_link)
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
357 with open("BioGRID.zip", "wb") as code:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
358 code.write(r.content)
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
359
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
360 #unzip files
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
361 with zipfile.ZipFile("BioGRID.zip", 'r') as zip_ref:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
362 if not os.path.exists("tmp_BioGRID"): os.makedirs("tmp_BioGRID")
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
363 zip_ref.extractall("tmp_BioGRID")
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
364
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
365 #import file of interest and build dictionary
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
366 file_path="tmp_BioGRID/BIOGRID-ORGANISM-"+species_dict[species].replace(" ","_")+"-3.5.167.tab2.txt"
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
367 with open(file_path,"r") as handle :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
368 tab_file = csv.reader(handle,delimiter="\t")
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
369 dico_network = {}
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
370 GeneID_index=1
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
371 network_cols=[1,2,7,8,11,12,14,18,20]
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
372 for line in tab_file :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
373 if line[GeneID_index] not in dico_network:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
374 dico_network[line[GeneID_index]]=[[line[i] for i in network_cols]]
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
375 else:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
376 dico_network[line[GeneID_index]].append([line[i] for i in network_cols])
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
377
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
378 #delete tmp_BioGRID directory
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
379 os.remove("BioGRID.zip")
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
380 shutil.rmtree("tmp_BioGRID", ignore_errors=True)
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
381
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
382 #download NCBI2Reactome.txt file and build dictionary
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
383 with requests.Session() as s:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
384 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
385 r.encoding ="utf-8"
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
386 tab_file = csv.reader(r.content.splitlines(), delimiter='\t')
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
387
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
388 dico_nodes = {}
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
389 geneid_index=0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
390 pathway_description_index=3
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
391 species_index=5
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
392 for line in tab_file :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
393 if line[species_index]==species_dict[species]:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
394 if line[geneid_index] in dico_nodes :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
395 dico_nodes[line[geneid_index]].append(line[pathway_description_index])
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
396 else :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
397 dico_nodes[line[geneid_index]] = [line[pathway_description_index]]
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
398
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
399 dico={}
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
400 dico['network']=dico_network
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
401 dico['nodes']=dico_nodes
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
402
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
403 ##Bioplex
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
404 elif interactome=="bioplex":
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
405
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
406 with requests.Session() as s:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
407 r = s.get('http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv')
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
408 r = r.content.decode('utf-8')
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
409 bioplex = csv.reader(r.splitlines(), delimiter='\t')
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
410
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
411 dico_network = {}
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
412 dico_network["GeneID"]={}
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
413 network_geneid_cols=[0,1,4,5,8]
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
414 dico_network["UniProt-AC"]={}
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
415 network_uniprot_cols=[2,3,4,5,8]
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
416 dico_GeneID_to_UniProt = {}
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
417 for line in bioplex :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
418 if line[0] not in dico_network["GeneID"]:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
419 dico_network["GeneID"][line[0]]=[[line[i] for i in network_geneid_cols]]
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
420 else :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
421 dico_network["GeneID"][line[0]].append([line[i] for i in network_geneid_cols])
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
422 if line[1] not in dico_network["UniProt-AC"]:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
423 dico_network["UniProt-AC"][line[2]]=[[line[i] for i in network_uniprot_cols]]
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
424 else:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
425 dico_network["UniProt-AC"][line[2]].append([line[i] for i in network_uniprot_cols])
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
426 dico_GeneID_to_UniProt[line[0]]=line[2]
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
427
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
428 with requests.Session() as s:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
429 r = s.get('https://reactome.org/download/current/UniProt2Reactome.txt')
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
430 r.encoding ="utf-8"
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
431 tab_file = csv.reader(r.content.splitlines(), delimiter='\t')
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
432
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
433 dico_nodes_uniprot = {}
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
434 uniProt_index=0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
435 pathway_description_index=3
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
436 species_index=5
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
437 for line in tab_file :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
438 if line[species_index]==species_dict[species]:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
439 if line[uniProt_index] in dico_nodes_uniprot :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
440 dico_nodes_uniprot[line[uniProt_index]].append(line[pathway_description_index])
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
441 else :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
442 dico_nodes_uniprot[line[uniProt_index]] = [line[pathway_description_index]]
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
443
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
444 with requests.Session() as s:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
445 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
446 r.encoding ="utf-8"
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
447 tab_file = csv.reader(r.content.splitlines(), delimiter='\t')
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
448
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
449 dico_nodes_geneid = {}
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
450 geneid_index=0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
451 pathway_description_index=3
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
452 species_index=5
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
453 for line in tab_file :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
454 if line[species_index]==species_dict[species]:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
455 if line[geneid_index] in dico_nodes_geneid :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
456 dico_nodes_geneid[line[geneid_index]].append(line[pathway_description_index])
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
457 else :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
458 dico_nodes_geneid[line[geneid_index]] = [line[pathway_description_index]]
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
459
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
460 dico={}
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
461 dico_nodes={}
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
462 dico_nodes['GeneID']=dico_nodes_geneid
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
463 dico_nodes['UniProt-AC']=dico_nodes_uniprot
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
464 dico['network']=dico_network
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
465 dico['nodes']=dico_nodes
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
466 dico['convert']=dico_GeneID_to_UniProt
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
467
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
468 ##Humap
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
469 elif interactome=="humap":
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
470
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
471 with requests.Session() as s:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
472 r = s.get('http://proteincomplexes.org/static/downloads/nodeTable.txt')
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
473 r = r.content.decode('utf-8')
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
474 humap_nodes = csv.reader(r.splitlines(), delimiter=',')
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
475
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
476 dico_geneid_to_gene_name={}
9
cdd29444e0af planemo upload commit 71363136045353f422ff98219c1eb84f6fc6193a-dirty
proteore
parents: 8
diff changeset
477 dico_protein_name={}
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
478 for line in humap_nodes :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
479 if check_entrez_geneid(line[4]):
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
480 if line[4] not in dico_geneid_to_gene_name:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
481 dico_geneid_to_gene_name[line[4]]=line[3]
9
cdd29444e0af planemo upload commit 71363136045353f422ff98219c1eb84f6fc6193a-dirty
proteore
parents: 8
diff changeset
482 if line[4] not in dico_protein_name:
cdd29444e0af planemo upload commit 71363136045353f422ff98219c1eb84f6fc6193a-dirty
proteore
parents: 8
diff changeset
483 dico_protein_name[line[4]]=line[5]
cdd29444e0af planemo upload commit 71363136045353f422ff98219c1eb84f6fc6193a-dirty
proteore
parents: 8
diff changeset
484
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
485 with requests.Session() as s:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
486 r = s.get('http://proteincomplexes.org/static/downloads/pairsWprob.txt')
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
487 r = r.content.decode('utf-8')
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
488 humap = csv.reader(r.splitlines(), delimiter='\t')
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
489
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
490 dico_network = {}
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
491 for line in humap :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
492 if check_entrez_geneid(line[0]) and check_entrez_geneid(line[1]):
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
493
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
494 interactant_A, interactant_B = get_interactant_name(line,dico_geneid_to_gene_name)
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
495
4
ce01295cd601 planemo upload commit 51fc514a85c1055cab5bb6e76c90f3da7e648101-dirty
proteore
parents: 3
diff changeset
496 #first interactant (first column)
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
497 if line[0] not in dico_network:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
498 dico_network[line[0]]=[line[:2]+[interactant_A,interactant_B,line[2]]]
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
499 else :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
500 dico_network[line[0]].append(line[:2]+[interactant_A,interactant_B,line[2]])
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
501
4
ce01295cd601 planemo upload commit 51fc514a85c1055cab5bb6e76c90f3da7e648101-dirty
proteore
parents: 3
diff changeset
502 #second interactant (second column)
ce01295cd601 planemo upload commit 51fc514a85c1055cab5bb6e76c90f3da7e648101-dirty
proteore
parents: 3
diff changeset
503 if line[1] not in dico_network:
5
429e7481c392 planemo upload commit 51fc514a85c1055cab5bb6e76c90f3da7e648101-dirty
proteore
parents: 4
diff changeset
504 dico_network[line[1]]=[[line[1],line[0],interactant_B,interactant_A,line[2]]]
4
ce01295cd601 planemo upload commit 51fc514a85c1055cab5bb6e76c90f3da7e648101-dirty
proteore
parents: 3
diff changeset
505 else :
5
429e7481c392 planemo upload commit 51fc514a85c1055cab5bb6e76c90f3da7e648101-dirty
proteore
parents: 4
diff changeset
506 dico_network[line[1]].append([line[1],line[0],interactant_B,interactant_A,line[2]])
4
ce01295cd601 planemo upload commit 51fc514a85c1055cab5bb6e76c90f3da7e648101-dirty
proteore
parents: 3
diff changeset
507
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
508 with requests.Session() as s:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
509 r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
510 r.encoding ="utf-8"
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
511 tab_file = csv.reader(r.content.splitlines(), delimiter='\t')
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
512
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
513 dico_nodes = {}
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
514 geneid_index=0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
515 pathway_description_index=3
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
516 species_index=5
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
517 for line in tab_file :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
518 if line[species_index]==species_dict[species]:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
519 #Fill dictionary with pathways
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
520 if line[geneid_index] in dico_nodes :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
521 dico_nodes[line[geneid_index]].append(line[pathway_description_index])
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
522 else :
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
523 dico_nodes[line[geneid_index]] = [line[pathway_description_index]]
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
524
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
525 dico={}
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
526 dico['network']=dico_network
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
527 dico['nodes']=dico_nodes
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
528 dico['gene_name']=dico_geneid_to_gene_name
9
cdd29444e0af planemo upload commit 71363136045353f422ff98219c1eb84f6fc6193a-dirty
proteore
parents: 8
diff changeset
529 dico['protein_name']=dico_protein_name
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
530
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
531 #writing output
27
9a400ce8e4e6 planemo upload commit bb113d19d3a756f70784e6a1433902888686ed96-dirty
proteore
parents: 26
diff changeset
532 output_file = species+'_'+interactome+'_'+ time.strftime("%Y-%m-%d") + ".json"
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
533 path = os.path.join(target_directory,output_file)
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
534 name = species+" ("+species_dict[species]+") "+time.strftime("%d/%m/%Y")
27
9a400ce8e4e6 planemo upload commit bb113d19d3a756f70784e6a1433902888686ed96-dirty
proteore
parents: 26
diff changeset
535 release = species+"_"+interactome+"_"+ time.strftime("%Y-%m-%d")
26
bf6940ff60a8 planemo upload commit 0290724216a2c445b4e28842153b84a1b28f4e9a-dirty
proteore
parents: 24
diff changeset
536 id = str(10000000000 - int(time.strftime("%Y%m%d")))
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
537
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
538 with open(path, 'w') as handle:
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
539 json.dump(dico, handle, sort_keys=True)
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
540
26
bf6940ff60a8 planemo upload commit 0290724216a2c445b4e28842153b84a1b28f4e9a-dirty
proteore
parents: 24
diff changeset
541 data_table_entry = dict(id=id, release=release, name = name, species = species, value = path)
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
542 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries")
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
543
6
f281a1eb83d6 planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents: 5
diff changeset
544 #######################################################################################################
f281a1eb83d6 planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents: 5
diff changeset
545 # 5. nextprot (add protein features)
f281a1eb83d6 planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
proteore
parents: 5
diff changeset
546 #######################################################################################################
62
add6aa698fb0 "planemo upload commit 4747fc3ca8e24e0f6c0cfcde0992780c6d4ef4ff-dirty"
proteore
parents: 60
diff changeset
547
60
da9e74d3c40d "planemo upload commit a69fdb4cdc110c75fea7439f0d97b67158c1bbbf"
proteore
parents: 57
diff changeset
def Build_nextprot_ref_file(data_manager_dict,target_directory):
    """
    Build the neXtProt reference table (TSV) and register it in the data table.

    For every accession returned by id_list_from_nextprot_ftp(), the entry is
    fetched from https://api.nextprot.org/entry/<accession>.json and one row
    is written with the columns:
    NextprotID, MW, SeqLength, IsoPoint, Chr, SubcellLocations, Diseases,
    TMDomains, ProteinExistence.

    data_manager_dict -- dictionary passed through to _add_data_table_entry()
    target_directory  -- directory in which the dated .tsv file is created

    Mass, sequence length and isoelectric point are read from the first
    isoform listed in the entry; the chromosome from the first chromosomal
    location. A request error on the retry (second attempt) is not caught
    and propagates to the caller.
    """
    nextprot_ids_file = "nextprot_ac_list_all.txt"
    ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory)

    output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory,output_file)
    name = "neXtProt release "+time.strftime("%d-%m-%Y")
    release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y")

    header = ["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]

    # The context manager guarantees the file is flushed and closed before its
    # path is registered in the data table, even if a request fails midway.
    with open(path, 'w') as output:
        writer = csv.writer(output,delimiter="\t")
        writer.writerow(header)

        for nextprot_id in ids :
            entry = _fetch_nextprot_entry(nextprot_id)["entry"]
            first_isoform = entry["isoforms"][0]
            annotations = entry["annotationsByCategory"]

            #get info from json dictionary
            mass_mol = first_isoform["massAsString"]
            seq_length = first_isoform["sequenceLength"]
            iso_elec_point = first_isoform["isoelectricPointAsString"]
            chr_loc = entry["chromosomalLocations"][0]["chromosome"]
            protein_existence = "PE"+str(entry["overview"]['proteinExistence']['level'])

            #unique subcellular locations and diseases ("NA" when none)
            all_subcell_locs = _nextprot_cv_terms(annotations,"subcellular-location")
            all_diseases = _nextprot_cv_terms(annotations,"disease")

            #number of transmembrane-region annotations (0 when the category is absent)
            nb_domains = len(annotations.get("transmembrane-region",[]))

            writer.writerow([nextprot_id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])

    # Decreasing id: a later build date yields a smaller number.
    entry_id = str(10000000000 - int(time.strftime("%Y%m%d")))

    data_table_entry = dict(id=entry_id, release=release_id, name = name, value = path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref")


def _fetch_nextprot_entry(nextprot_id):
    """Return the decoded JSON of one neXtProt entry, retrying once after an hour on a request error."""
    query="https://api.nextprot.org/entry/"+nextprot_id+".json"
    try:
        resp = requests.get(url=query)
    except requests.exceptions.RequestException:
        # Only network/HTTP-layer failures trigger the retry; KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        print ("wainting 1 hour before trying again")
        time.sleep(3600)
        resp = requests.get(url=query)
    return resp.json()


def _nextprot_cv_terms(annotations, category):
    """Return the unique non-empty cvTermName values of a category joined by ';', or "NA" if there are none."""
    terms = set()
    for annotation in annotations.get(category,[]):
        term = annotation.get('cvTermName')
        # Skip None and "" so the join cannot fail and empty terms are not emitted.
        if term:
            terms.add(term)
    # Sorted so that two builds of the same release produce identical files.
    return ";".join(sorted(terms)) if terms else "NA"
0
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
621
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
622 #######################################################################################################
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
623 # Main function
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
624 #######################################################################################################
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
def main():
    """Command-line entry point of the resource-building data manager.

    Reads the JSON parameter file passed with ``-o/--output`` to find the
    ``extra_files_path`` of the first ``output_data`` item, creates that
    directory, builds every resource requested on the command line into it,
    and finally overwrites the same JSON file with the collected data-table
    entries (``data_manager_dict``).

    Command-line options (all optional; argparse sets unsupplied ones to None):
        --hpa            comma-separated values, each passed to HPA_sources
        --peptideatlas   comma-separated values, each passed to
                         peptide_atlas_sources together with --date
        --date           forwarded unchanged to peptide_atlas_sources
        --id_mapping     comma-separated species, each passed to
                         id_mapping_sources together with --tool_data_path
        --interactome    PPI source name, passed to PPI_ref_files
        --species        species used when --interactome is "biogrid"
        --database       when given, triggers Build_nextprot_ref_file
        --tool_data_path forwarded unchanged to id_mapping_sources
        -o / --output    path of the JSON file read for parameters and
                         rewritten with the results

    Raises:
        OSError: if the target directory cannot be created (for instance
            because it already exists) or the JSON file cannot be opened.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--hpa", metavar="HPA_OPTION")
    parser.add_argument("--peptideatlas", metavar="SAMPLE_CATEGORY_ID")
    parser.add_argument("--id_mapping", metavar="ID_MAPPING_SPECIES")
    parser.add_argument("--interactome", metavar="PPI")
    parser.add_argument("--species")
    parser.add_argument("--date")
    parser.add_argument("-o", "--output")
    parser.add_argument("--database")
    parser.add_argument("--tool_data_path")
    args = parser.parse_args()

    data_manager_dict = {}

    # Extract json file params; the context manager closes the handle promptly.
    filename = args.output
    with open(filename) as params_file:
        params = from_json_string(params_file.read())
    target_directory = params['output_data'][0]['extra_files_path']
    os.mkdir(target_directory)

    # argparse always defines every declared option on ``args`` (None when it
    # was not supplied), so a plain ``is not None`` test is all that is needed
    # to know whether a resource was requested.

    ## Download source files from HPA
    if args.hpa is not None:
        for hpa_tissue in args.hpa.split(","):
            HPA_sources(data_manager_dict, hpa_tissue, target_directory)

    ## Download source file from Peptide Atlas query
    if args.peptideatlas is not None:
        for pa_tissue in args.peptideatlas.split(","):
            peptide_atlas_sources(data_manager_dict, pa_tissue, args.date, target_directory)

    ## Download ID_mapping source file from Uniprot
    if args.id_mapping is not None:
        for species in args.id_mapping.split(","):
            id_mapping_sources(data_manager_dict, species, target_directory, args.tool_data_path)

    ## Download PPI ref files from biogrid/bioplex/humap
    interactome = args.interactome
    if interactome is not None:
        # Only biogrid takes the species from the command line; every other
        # interactome is built for "Human".
        species = args.species if interactome == "biogrid" else "Human"
        # biogrid without --species is silently skipped.
        if species is not None:
            PPI_ref_files(data_manager_dict, species, interactome, target_directory)

    ## Build nextprot ref file for add protein features
    if args.database is not None:
        Build_nextprot_ref_file(data_manager_dict, target_directory)

    # Save info to json file. Text mode is required: the serialised JSON is a
    # text string, and writing it to a binary handle fails on Python 3.
    with open(filename, 'w') as output_file:
        output_file.write(to_json_string(data_manager_dict))
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
702
0a26460d7366 planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
proteore
parents:
diff changeset
# Run the data manager only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()