data_manager/resource_building.py @ 0:0a26460d7366 draft

planemo upload commit dbc027f59706f5b7d3f9f9319f2652baa50e2df5-dirty
author proteore
date Mon, 04 Mar 2019 05:03:04 -0500
parents
children 0915249b8c4b
# -*- coding: utf-8 -*-
"""
Build reference source files from public databases for use by other ProteoRE tools.
"""

import os, sys, argparse, requests, time, csv, re, json, shutil, zipfile
from io import BytesIO
from zipfile import ZipFile
from galaxy.util.json import from_json_string, to_json_string

#######################################################################################################
# General functions
#######################################################################################################
def unzip(url, output_file):
    """
    Download a zip archive from a URL and write its first member to output_file.
    """
    content = requests.get(url)
    zip_file = ZipFile(BytesIO(content.content))  # local name avoids shadowing the zipfile module
    output_content = zip_file.open(zip_file.namelist()[0]).read()
    with open(output_file, "wb") as output:  # ZipFile.read() returns bytes, so write in binary mode
        output.write(output_content)

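# Example (illustrative): fetch the HPA normal-tissue archive used by HPA_sources
# below and write its first member to a hypothetical local path.
#   unzip("https://www.proteinatlas.org/download/normal_tissue.tsv.zip",
#         "/tmp/normal_tissue.tsv")
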
def _add_data_table_entry(data_manager_dict, data_table_entry, data_table):
    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
    data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
    data_manager_dict['data_tables'][data_table].append(data_table_entry)
    return data_manager_dict

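# Illustrative result: after one call with data_table="proteore_protein_atlas",
# data_manager_dict holds {"data_tables": {"proteore_protein_atlas": [entry]}},
# the structure Galaxy expects in a data manager's JSON output.
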
#######################################################################################################
# 1. Human Protein Atlas
#    - Normal tissue
#    - Pathology
#    - Full Atlas
#######################################################################################################
def HPA_sources(data_manager_dict, tissue, target_directory):
    if tissue == "HPA_normal_tissue":
        tissue_name = "HPA normal tissue"
        url = "https://www.proteinatlas.org/download/normal_tissue.tsv.zip"
    elif tissue == "HPA_pathology":
        tissue_name = "HPA pathology"
        url = "https://www.proteinatlas.org/download/pathology.tsv.zip"
    elif tissue == "HPA_full_atlas":
        tissue_name = "HPA full atlas"
        url = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"
    else:
        raise ValueError("Unknown HPA dataset: " + tissue)  # url/tissue_name would otherwise be undefined

    output_file = tissue + "_" + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)
    unzip(url, path)  # download and save file
    tissue_name = tissue_name + " " + time.strftime("%d/%m/%Y")
    tissue_id = tissue_name.replace(" ", "_").replace("/", "-")

    data_table_entry = dict(id=tissue_id, name=tissue_name, tissue=tissue, value=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_protein_atlas")


#######################################################################################################
# 2. Peptide Atlas
#######################################################################################################
def peptide_atlas_sources(data_manager_dict, tissue, date, target_directory):
    # Define organism_id (here human) - to be extended when other organisms are added to the project
    organism_id = "2"
    # Extract sample_category_id and output filename
    tissue = tissue.split(".")
    sample_category_id = tissue[0]
    tissue_name = tissue[1]
    output_file = tissue_name + "_" + date + ".tsv"

    query = "https://db.systemsbiology.net/sbeams/cgi/PeptideAtlas/GetProteins?&atlas_build_id=" + \
        sample_category_id + "&display_options=ShowAbundances&organism_id=" + organism_id + \
        "&redundancy_constraint=4&presence_level_constraint=1%2C2&gene_annotation_level_constraint=leaf" + \
        "&QUERY_NAME=AT_GetProteins&action=QUERY&output_mode=tsv&apply_action=QUERY"

    with requests.Session() as s:
        download = s.get(query)
        decoded_content = download.content.decode('utf-8')
        cr = csv.reader(decoded_content.splitlines(), delimiter='\t')

    uni_dict = build_dictionary(cr)

    # columns of the peptide_atlas data table
    tissue_id = tissue_name + "_" + date
    name = tissue_id.replace("-", "/").replace("_", " ")
    path = os.path.join(target_directory, output_file)

    with open(path, "w") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerow(["Uniprot_AC", "nb_obs"])
        w.writerows(uni_dict.items())

    data_table_entry = dict(id=tissue_id, name=name, value=path, tissue=tissue_name)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_peptide_atlas")

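# Note (illustrative): `tissue` is expected as "<sample_category_id>.<tissue_name>";
# e.g. a hypothetical "446.human_brain" passes 446 as atlas_build_id in the query
# and writes human_brain_<date>.tsv. The real values come from the tool's XML wrapper.
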
# Count the number of observations per UniProt accession
def build_dictionary(csv_content):  # parameter renamed so it does not shadow the csv module
    uni_dict = {}
    for line in csv_content:
        if "-" not in line[0] and check_uniprot_access(line[0]):
            if line[0] in uni_dict:
                uni_dict[line[0]] += int(line[5])
            else:
                uni_dict[line[0]] = int(line[5])

    return uni_dict

# Check whether an id looks like a UniProt accession number: return True or False
def check_uniprot_access(id):
    uniprot_pattern = re.compile(r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
    return uniprot_pattern.match(id) is not None

def check_entrez_geneid(id):
    entrez_pattern = re.compile(r"[0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+")
    return entrez_pattern.match(id) is not None

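# Examples (illustrative):
#   check_uniprot_access("P04637")    -> True   (canonical UniProt AC, human TP53)
#   check_uniprot_access("NX_P04637") -> False  (neXtProt-prefixed id, not a bare AC)
#   check_entrez_geneid("7157")       -> True   (numeric Entrez GeneID)
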
#######################################################################################################
# 3. ID mapping file
#######################################################################################################
import ftplib, gzip
csv.field_size_limit(sys.maxsize)  # to handle big files

def id_mapping_sources(data_manager_dict, species, target_directory):

    human = species == "Human"
    species_dict = {"Human": "HUMAN_9606", "Mouse": "MOUSE_10090", "Rat": "RAT_10116"}
    files = ["idmapping_selected.tab.gz", "idmapping.dat.gz"]

    # header
    if human:
        tab = [["UniProt-AC", "UniProt-ID", "GeneID", "RefSeq", "GI", "PDB", "GO", "PIR", "MIM", "UniGene", "Ensembl_Gene", "Ensembl_Transcript", "Ensembl_Protein", "neXtProt", "BioGrid", "STRING", "KEGG"]]
    else:
        tab = [["UniProt-AC", "UniProt-ID", "GeneID", "RefSeq", "GI", "PDB", "GO", "PIR", "MIM", "UniGene", "Ensembl_Gene", "Ensembl_Transcript", "Ensembl_Protein", "BioGrid", "STRING", "KEGG"]]

    #print("header ok")

    # get idmapping_selected.tab and keep only the IDs of interest
    selected_tab_file = species_dict[species] + "_" + files[0]
    tab_path = download_from_uniprot_ftp(selected_tab_file, target_directory)
    with gzip.open(tab_path, "rt") as select:
        tab_reader = csv.reader(select, delimiter="\t")
        for line in tab_reader:
            tab.append([line[i] for i in [0, 1, 2, 3, 4, 5, 6, 11, 13, 14, 18, 19, 20]])
    os.remove(tab_path)
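    # Column indices above follow UniProt's idmapping_selected.tab layout as documented
    # in its README (0 UniProtKB-AC, 1 UniProtKB-ID, 2 GeneID, 3 RefSeq, 4 GI, 5 PDB,
    # 6 GO, 11 PIR, 13 MIM, 14 UniGene, 18 Ensembl, 19 Ensembl_TRS, 20 Ensembl_PRO),
    # matching the header built above.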

    #print("selected_tab ok")

    """
    Supplementary IDs to get from HUMAN_9606_idmapping.dat:
    neXtProt, BioGrid, STRING, KEGG
    """

    # there are more ID types for human
    if human:
        ids = ['neXtProt', 'BioGrid', 'STRING', 'KEGG']  # ids to get from dat_file
    else:
        ids = ['BioGrid', 'STRING', 'KEGG']
    unidict = {}

    # keep only the IDs of interest, in dictionaries
    dat_file = species_dict[species] + "_" + files[1]
    dat_path = download_from_uniprot_ftp(dat_file, target_directory)
    with gzip.open(dat_path, "rt") as dat:
        dat_reader = csv.reader(dat, delimiter="\t")
        for line in dat_reader:
            uniprotID = line[0]  # UniProt AC as key
            id_type = line[1]    # ID type of the corresponding id, key of the sub-dictionary
            cor_id = line[2]     # corresponding id
            if "-" not in uniprotID:  # we don't keep isoform accessions (e.g. P04637-2)
                if id_type in ids and uniprotID in unidict:
                    if id_type in unidict[uniprotID]:
                        # there is already a value in the dictionary
                        unidict[uniprotID][id_type] = ";".join([unidict[uniprotID][id_type], cor_id])
                    else:
                        unidict[uniprotID].update({id_type: cor_id})
                elif id_type in ids:
                    unidict[uniprotID] = {id_type: cor_id}
    os.remove(dat_path)
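    # Illustrative shape of unidict after this pass (accessions and values hypothetical):
    #   {"P12345": {"neXtProt": "NX_P12345", "BioGrid": "12345", "KEGG": "hsa:1234"}}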

    #print("dat_file ok")

    # add ids from idmapping.dat to the final tab
    for line in tab[1:]:
        uniprotID = line[0]
        if human:
            if uniprotID in unidict:
                nextprot = access_dictionary(unidict, uniprotID, 'neXtProt')
                if nextprot != '':
                    nextprot = clean_nextprot_id(nextprot, line[0])
                line.extend([nextprot, access_dictionary(unidict, uniprotID, 'BioGrid'),
                             access_dictionary(unidict, uniprotID, 'STRING'),
                             access_dictionary(unidict, uniprotID, 'KEGG')])
            else:
                line.extend(["", "", "", ""])
        else:
            if uniprotID in unidict:
                line.extend([access_dictionary(unidict, uniprotID, 'BioGrid'),
                             access_dictionary(unidict, uniprotID, 'STRING'),
                             access_dictionary(unidict, uniprotID, 'KEGG')])
            else:
                line.extend(["", "", ""])

    #print ("tab ok")

    # add missing neXtProt IDs for human
    if human:
        # build next_dict
        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt", target_directory)
        next_dict = {}
        for nextid in nextprot_ids:
            next_dict[nextid.replace("NX_", "")] = nextid
        os.remove(os.path.join(target_directory, "nextprot_ac_list_all.txt"))

        # add missing neXtProt ID
        for line in tab[1:]:
            uniprotID = line[0]
            nextprotID = line[13]
            if nextprotID == '' and uniprotID in next_dict:
                line[13] = next_dict[uniprotID]

    output_file = species + "_id_mapping_" + time.strftime("%d-%m-%Y") + ".tsv"
    path = os.path.join(target_directory, output_file)

    with open(path, "w") as out:
        w = csv.writer(out, delimiter='\t')
        w.writerows(tab)

    name_dict = {"Human": "Homo sapiens", "Mouse": "Mus musculus", "Rat": "Rattus norvegicus"}
    name = species + " (" + name_dict[species] + " " + time.strftime("%d/%m/%Y") + ")"
    entry_id = species + "_id_mapping_" + time.strftime("%d-%m-%Y")  # local name avoids shadowing the id() builtin

    data_table_entry = dict(id=entry_id, name=name, value=species, path=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping")

def download_from_uniprot_ftp(file, target_directory):
    ftp_dir = "pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.uniprot.org")
    ftp.login("anonymous", "anonymous")
    ftp.cwd(ftp_dir)
    with open(path, 'wb') as out:  # close the local file once the transfer ends
        ftp.retrbinary("RETR " + file, out.write)
    ftp.quit()
    return path

def id_list_from_nextprot_ftp(file, target_directory):
    ftp_dir = "pub/current_release/ac_lists/"
    path = os.path.join(target_directory, file)
    ftp = ftplib.FTP("ftp.nextprot.org")
    ftp.login("anonymous", "anonymous")
    ftp.cwd(ftp_dir)
    with open(path, 'wb') as out:
        ftp.retrbinary("RETR " + file, out.write)
    ftp.quit()
    with open(path, 'r') as nextprot_ids:
        nextprot_ids = nextprot_ids.read().splitlines()
    return nextprot_ids

# Return '' if there's no value in a dictionary, avoiding KeyError
def access_dictionary(dico, key1, key2):
    if key1 in dico:
        if key2 in dico[key1]:
            return dico[key1][key2]
        else:
            return ''
            #print (key2,"not in ",dico,"[",key1,"]")
    else:
        return ''

# If there are several neXtProt IDs for one UniProt AC, return the UniProt-like one
def clean_nextprot_id(next_id, uniprotAc):
    if len(next_id.split(";")) > 1:
        tmp = next_id.split(";")
        if "NX_" + uniprotAc in tmp:
            return "NX_" + uniprotAc
        else:
            return tmp[1]
    else:
        return next_id


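# Example (hypothetical accessions): clean_nextprot_id("NX_Q00001;NX_P12345", "P12345")
# returns "NX_P12345"; when no entry matches "NX_" + uniprotAc, the second entry is kept.
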
#######################################################################################################
# 4. Build protein interaction map files
#######################################################################################################

def get_interactant_name(line, dico):
    # map both interactor IDs to names, defaulting to "NA" when unknown
    interactant_A = dico.get(line[0], "NA")
    interactant_B = dico.get(line[1], "NA")
    return interactant_A, interactant_B

def PPI_ref_files(data_manager_dict, species, interactome, target_directory):

    species_dict = {"Human": "Homo sapiens", "Mouse": "Mus musculus", "Rat": "Rattus norvegicus"}

    ## BioGRID
    if interactome == "biogrid":

        tab2_link = "https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.167/BIOGRID-ORGANISM-3.5.167.tab2.zip"

        # download zip file
        r = requests.get(tab2_link)
        with open("BioGRID.zip", "wb") as code:
            code.write(r.content)

        # unzip files
        with zipfile.ZipFile("BioGRID.zip", 'r') as zip_ref:
            if not os.path.exists("tmp_BioGRID"):
                os.makedirs("tmp_BioGRID")
            zip_ref.extractall("tmp_BioGRID")

        # import the file of interest and build the network dictionary
        file_path = "tmp_BioGRID/BIOGRID-ORGANISM-" + species_dict[species].replace(" ", "_") + "-3.5.167.tab2.txt"
        with open(file_path, "r") as handle:
            tab_file = csv.reader(handle, delimiter="\t")
            dico_network = {}
            GeneID_index = 1
            network_cols = [1, 2, 7, 8, 11, 12, 14, 18, 20]
            for line in tab_file:
                if line[GeneID_index] not in dico_network:
                    dico_network[line[GeneID_index]] = [[line[i] for i in network_cols]]
                else:
                    dico_network[line[GeneID_index]].append([line[i] for i in network_cols])

        # delete the temporary BioGRID files
        os.remove("BioGRID.zip")
        shutil.rmtree("tmp_BioGRID", ignore_errors=True)

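        # network_cols above, per the BioGRID tab2 format (assumed from its spec):
        # 1/2 Entrez Gene interactors A/B, 7/8 official symbols A/B,
        # 11 experimental system, 12 experimental system type, 14 Pubmed ID,
        # 18 score, 20 phenotypes.
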
        # download the NCBI2Reactome.txt file and build the nodes dictionary
        with requests.Session() as s:
            r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
            r = r.content.decode('utf-8')  # decode before parsing, as in the BioPlex/HuMAP sections below
            tab_file = csv.reader(r.splitlines(), delimiter='\t')

        dico_nodes = {}
        geneid_index = 0
        pathway_description_index = 3
        species_index = 5
        for line in tab_file:
            if line[species_index] == species_dict[species]:
                if line[geneid_index] in dico_nodes:
                    dico_nodes[line[geneid_index]].append(line[pathway_description_index])
                else:
                    dico_nodes[line[geneid_index]] = [line[pathway_description_index]]

        dico = {}
        dico['network'] = dico_network
        dico['nodes'] = dico_nodes

    ## BioPlex
    elif interactome == "bioplex":

        with requests.Session() as s:
            r = s.get('http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv')
            r = r.content.decode('utf-8')
            bioplex = csv.reader(r.splitlines(), delimiter='\t')

        dico_network = {}
        dico_network["GeneID"] = {}
        network_geneid_cols = [0, 1, 4, 5, 8]
        dico_network["UniProt-AC"] = {}
        network_uniprot_cols = [2, 3, 4, 5, 8]
        dico_GeneID_to_UniProt = {}
        for line in bioplex:
            if line[0] not in dico_network["GeneID"]:
                dico_network["GeneID"][line[0]] = [[line[i] for i in network_geneid_cols]]
            else:
                dico_network["GeneID"][line[0]].append([line[i] for i in network_geneid_cols])
            if line[2] not in dico_network["UniProt-AC"]:  # test the same UniProt key (line[2]) that is inserted below
                dico_network["UniProt-AC"][line[2]] = [[line[i] for i in network_uniprot_cols]]
            else:
                dico_network["UniProt-AC"][line[2]].append([line[i] for i in network_uniprot_cols])
            dico_GeneID_to_UniProt[line[0]] = line[2]

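        # Column indices above, per BioPlex_interactionList_v4a.tsv (assumed layout):
        # 0/1 Entrez GeneIDs A/B, 2/3 UniProt ACs A/B, 4/5 gene symbols A/B,
        # 8 p(Interaction) from the BioPlex scoring model.
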
        with requests.Session() as s:
            r = s.get('https://reactome.org/download/current/UniProt2Reactome.txt')
            r = r.content.decode('utf-8')
            tab_file = csv.reader(r.splitlines(), delimiter='\t')

        dico_nodes_uniprot = {}
        uniProt_index = 0
        pathway_description_index = 3
        species_index = 5
        for line in tab_file:
            if line[species_index] == species_dict[species]:
                if line[uniProt_index] in dico_nodes_uniprot:
                    dico_nodes_uniprot[line[uniProt_index]].append(line[pathway_description_index])
                else:
                    dico_nodes_uniprot[line[uniProt_index]] = [line[pathway_description_index]]

        with requests.Session() as s:
            r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
            r = r.content.decode('utf-8')
            tab_file = csv.reader(r.splitlines(), delimiter='\t')

        dico_nodes_geneid = {}
        geneid_index = 0
        pathway_description_index = 3
        species_index = 5
        for line in tab_file:
            if line[species_index] == species_dict[species]:
                if line[geneid_index] in dico_nodes_geneid:
                    dico_nodes_geneid[line[geneid_index]].append(line[pathway_description_index])
                else:
                    dico_nodes_geneid[line[geneid_index]] = [line[pathway_description_index]]

        dico = {}
        dico_nodes = {}
        dico_nodes['GeneID'] = dico_nodes_geneid
        dico_nodes['UniProt-AC'] = dico_nodes_uniprot
        dico['network'] = dico_network
        dico['nodes'] = dico_nodes
        dico['convert'] = dico_GeneID_to_UniProt

    ## HuMAP
    elif interactome == "humap":

        with requests.Session() as s:
            r = s.get('http://proteincomplexes.org/static/downloads/nodeTable.txt')
            r = r.content.decode('utf-8')
            humap_nodes = csv.reader(r.splitlines(), delimiter=',')

        dico_geneid_to_gene_name = {}
        for line in humap_nodes:
            if check_entrez_geneid(line[4]):
                if line[4] not in dico_geneid_to_gene_name:
                    dico_geneid_to_gene_name[line[4]] = line[3]

        with requests.Session() as s:
            r = s.get('http://proteincomplexes.org/static/downloads/pairsWprob.txt')
            r = r.content.decode('utf-8')
            humap = csv.reader(r.splitlines(), delimiter='\t')

        dico_network = {}
        for line in humap:
            if check_entrez_geneid(line[0]) and check_entrez_geneid(line[1]):

                interactant_A, interactant_B = get_interactant_name(line, dico_geneid_to_gene_name)

                if line[0] not in dico_network:
                    dico_network[line[0]] = [line[:2] + [interactant_A, interactant_B, line[2]]]
                else:
                    dico_network[line[0]].append(line[:2] + [interactant_A, interactant_B, line[2]])

        with requests.Session() as s:
            r = s.get('https://www.reactome.org/download/current/NCBI2Reactome.txt')
            r = r.content.decode('utf-8')
            tab_file = csv.reader(r.splitlines(), delimiter='\t')

        dico_nodes = {}
        geneid_index = 0
        pathway_description_index = 3
        species_index = 5
        for line in tab_file:
            if line[species_index] == species_dict[species]:
                # fill the dictionary with pathways
                if line[geneid_index] in dico_nodes:
                    dico_nodes[line[geneid_index]].append(line[pathway_description_index])
                else:
                    dico_nodes[line[geneid_index]] = [line[pathway_description_index]]

        dico = {}
        dico['network'] = dico_network
        dico['nodes'] = dico_nodes
        dico['gene_name'] = dico_geneid_to_gene_name

    # write output
    output_file = species + '_' + interactome + '_' + time.strftime("%d-%m-%Y") + ".json"
    path = os.path.join(target_directory, output_file)
    name = species + " (" + species_dict[species] + ") " + time.strftime("%d/%m/%Y")
    entry_id = species + "_" + interactome + "_" + time.strftime("%d-%m-%Y")  # local name avoids shadowing the id() builtin

    with open(path, 'w') as handle:
        json.dump(dico, handle, sort_keys=True)

    data_table_entry = dict(id=entry_id, name=name, species=species, value=path)
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_" + interactome + "_dictionaries")


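# Illustrative top-level layout of the JSON written by PPI_ref_files:
#   biogrid: {"network": ..., "nodes": ...}
#   bioplex: {"network": ..., "nodes": ..., "convert": ...}
#   humap:   {"network": ..., "nodes": ..., "gene_name": ...}
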
#######################################################################################################
# Main function
#######################################################################################################
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--hpa", metavar=("HPA_OPTION"))
    parser.add_argument("--peptideatlas", metavar=("SAMPLE_CATEGORY_ID"))
    parser.add_argument("--id_mapping", metavar=("ID_MAPPING_SPECIES"))
    parser.add_argument("--interactome", metavar=("PPI"))
    parser.add_argument("--species")
    parser.add_argument("--date")
    parser.add_argument("-o", "--output")
    args = parser.parse_args()

    data_manager_dict = {}
    # Extract json file params
    filename = args.output
    with open(filename) as fh:
        params = from_json_string(fh.read())
    target_directory = params['output_data'][0]['extra_files_path']
    os.mkdir(target_directory)

    ## Download source files from HPA
    # argparse sets unused optional arguments to None, so a plain None check suffices
    hpa = args.hpa
    if hpa is not None:
        #target_directory = "/projet/galaxydev/galaxy/tools/proteore/ProteoRE/tools/resources_building/test-data/"
        hpa = hpa.split(",")
        for hpa_tissue in hpa:
            HPA_sources(data_manager_dict, hpa_tissue, target_directory)

    ## Download source file from a Peptide Atlas query
    peptide_atlas = args.peptideatlas
    date = args.date
    if peptide_atlas is not None:
        #target_directory = "/projet/galaxydev/galaxy/tools/proteore/ProteoRE/tools/resources_building/test-data/"
        peptide_atlas = peptide_atlas.split(",")
        for pa_tissue in peptide_atlas:
            peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory)

    ## Download the ID mapping source file from UniProt
    id_mapping = args.id_mapping
    if id_mapping is not None:
        id_mapping = id_mapping.split(",")
        for species in id_mapping:
            id_mapping_sources(data_manager_dict, species, target_directory)

    ## Download PPI reference files from BioGRID/BioPlex/HuMAP
    interactome = args.interactome
    if interactome == "biogrid":
        species = args.species
    else:
        species = "Human"
    if interactome is not None and species is not None:
        PPI_ref_files(data_manager_dict, species, interactome, target_directory)

    # save info to the JSON output file
    filename = args.output
    with open(filename, 'w') as out:  # to_json_string returns text, so write in text mode
        out.write(to_json_string(data_manager_dict))

if __name__ == "__main__":
    main()
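
# Example invocation (illustrative; in production the Galaxy data manager supplies
# the arguments and the JSON params/output file):
#   python resource_building.py --hpa HPA_normal_tissue,HPA_pathology -o output.json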