comparison data_manager/resource_building.py @ 62:add6aa698fb0 draft

"planemo upload commit 4747fc3ca8e24e0f6c0cfcde0992780c6d4ef4ff-dirty"
author proteore
date Tue, 09 Jun 2020 16:14:33 +0000
parents da9e74d3c40d
children 54089754ba12
comparison
equal deleted inserted replaced
61:5504538d24f6 62:add6aa698fb0
278 ftp.cwd(ftp_dir) 278 ftp.cwd(ftp_dir)
279 ftp.retrbinary("RETR " + file, open(path, 'wb').write) 279 ftp.retrbinary("RETR " + file, open(path, 'wb').write)
280 ftp.quit() 280 ftp.quit()
281 return (path) 281 return (path)
282 282
283 def download_from_nextprot_ftp(file,target_directory): 283 def download_from_nextprot_ftp(file,target_directory) :
284 ftp_dir = "pub/current_release/ac_lists/" 284 ftp_dir = "pub/current_release/ac_lists/"
285 path = os.path.join(target_directory, file) 285 path = os.path.join(target_directory, file)
286 ftp = ftplib.FTP("ftp.nextprot.org") 286 ftp = ftplib.FTP("ftp.nextprot.org")
287 ftp.login("anonymous", "anonymous") 287 ftp.login("anonymous", "anonymous")
288 ftp.cwd(ftp_dir) 288 ftp.cwd(ftp_dir)
289 ftp.retrbinary("RETR " + file, open(path, 'wb').write) 289 ftp.retrbinary("RETR " + file, open(path, 'wb').write)
290 ftp.quit() 290 ftp.quit()
291 return path 291 return (path)
292 292
293 def id_list_from_nextprot_ftp(file) : 293 def id_list_from_nextprot_ftp(file,target_directory) :
294 ftp_dir = "pub/current_release/ac_lists/"
295 path = os.path.join(target_directory, file)
294 ftp = ftplib.FTP("ftp.nextprot.org") 296 ftp = ftplib.FTP("ftp.nextprot.org")
295 ftp.login("anonymous", "anonymous") 297 ftp.login("anonymous", "anonymous")
296 r = StringIO() 298 ftp.cwd(ftp_dir)
297 ftp.retrlines("RETR " + file, lambda line: r.write(line + '\n')) 299 ftp.retrbinary("RETR " + file, open(path, 'wb').write)
298 ftp.quit() 300 ftp.quit()
299 r.seek(0) 301 with open(path,'r') as nextprot_ids :
300 ids = r.readlines() 302 nextprot_ids = nextprot_ids.read().splitlines()
301 ids = [id.strip('\n') for id in ids] 303 return (nextprot_ids)
302
303 return (ids)
304 304
305 #return '' if there's no value in a dictionary, avoid error 305 #return '' if there's no value in a dictionary, avoid error
306 def access_dictionary (dico,key1,key2) : 306 def access_dictionary (dico,key1,key2) :
307 if key1 in dico : 307 if key1 in dico :
308 if key2 in dico[key1] : 308 if key2 in dico[key1] :
542 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries") 542 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries")
543 543
544 ####################################################################################################### 544 #######################################################################################################
545 # 5. nextprot (add protein features) 545 # 5. nextprot (add protein features)
546 ####################################################################################################### 546 #######################################################################################################
547
547 def Build_nextprot_ref_file(data_manager_dict,target_directory): 548 def Build_nextprot_ref_file(data_manager_dict,target_directory):
548 549 nextprot_ids_file = "nextprot_ac_list_all.txt"
549 from requests_futures.sessions import FuturesSession 550 ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory)
550 from concurrent.futures import ProcessPoolExecutor
551
552 #Get nextprot ids list
553 ids = id_list_from_nextprot_ftp("pub/current_release/ac_lists/nextprot_ac_list_all.txt")
554 551
555 output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv" 552 output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv"
556 path = os.path.join(target_directory,output_file) 553 path = os.path.join(target_directory,output_file)
557 name = "neXtProt release "+time.strftime("%d-%m-%Y") 554 name = "neXtProt release "+time.strftime("%d-%m-%Y")
558 release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y") 555 release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y")
559 556
560 #open output file to write
561 output = open(path, 'w') 557 output = open(path, 'w')
562 writer = csv.writer(output,delimiter="\t") 558 writer = csv.writer(output,delimiter="\t")
563 writer.writerow(["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]) 559
564 560 nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]]
565 subset=100 561 writer.writerows(nextprot_file)
566 ids_subsets = [ids[x:x+subset] for x in range(0, len(ids), subset)] 562
567 563 for id in ids :
568 for ids_subset in ids_subsets: 564 query="https://api.nextprot.org/entry/"+id+".json"
569 565 try:
570 #Open concurent sessions 566 resp = requests.get(url=query)
571 with FuturesSession(executor=ProcessPoolExecutor(max_workers=8)) as session: 567 except :
572 futures = [session.get("https://api.nextprot.org/entry/"+id+".json") for id in ids_subset] 568 print ("wainting 1 hour before trying again")
573 569 time.sleep(3600)
574 for id,future in zip(ids_subset,futures) : 570 resp = requests.get(url=query)
575 571 data = resp.json()
576 #Get json dictionary 572
577 try: 573 #get info from json dictionary
578 res = future.result() 574 mass_mol = data["entry"]["isoforms"][0]["massAsString"]
579 except: 575 seq_length = data['entry']["isoforms"][0]["sequenceLength"]
580 print ("sleep 1 hour") 576 iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"]
581 time.sleep(3600) 577 chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"]
582 res = future.result() 578 protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level'])
583 data = res.json() 579
584 580 #put all subcell loc in a set
585 #get info from json dictionary 581 if "subcellular-location" in data['entry']["annotationsByCategory"].keys() :
586 mass_mol = data["entry"]["isoforms"][0]["massAsString"] 582 subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"]
587 seq_length = data['entry']["isoforms"][0]["sequenceLength"] 583 all_subcell_locs = set()
588 iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"] 584 for loc in subcell_locs :
589 chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"] 585 all_subcell_locs.add(loc['cvTermName'])
590 protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level']) 586 all_subcell_locs.discard("")
591 587 all_subcell_locs = ";".join(all_subcell_locs)
592 #put all subcell loc in a set 588 else :
593 if "subcellular-location" in data['entry']["annotationsByCategory"].keys() : 589 all_subcell_locs = "NA"
594 subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"] 590
595 all_subcell_locs = set() 591 #put all subcell loc in a set
596 for loc in subcell_locs : 592 if ('disease') in data['entry']['annotationsByCategory'].keys() :
597 all_subcell_locs.add(loc['cvTermName']) 593 diseases = data['entry']['annotationsByCategory']['disease']
598 all_subcell_locs.discard("") 594 all_diseases = set()
599 all_subcell_locs = ";".join(all_subcell_locs) 595 for disease in diseases :
600 else : 596 if (disease['cvTermName'] is not None and disease['cvTermName'] != ""):
601 all_subcell_locs = "NA" 597 all_diseases.add(disease['cvTermName'])
602 598 if len(all_diseases) > 0 : all_diseases = ";".join(all_diseases)
603 #put all subcell loc in a set 599 else : all_diseases="NA"
604 if ('disease') in data['entry']['annotationsByCategory'].keys() : 600 else :
605 diseases = data['entry']['annotationsByCategory']['disease'] 601 all_diseases="NA"
606 all_diseases = set() 602
607 for disease in diseases : 603 #get all tm domain
608 if (disease['cvTermName'] is not None and disease['cvTermName'] != ""): 604 nb_domains = 0
609 all_diseases.add(disease['cvTermName']) 605 if "transmembrane-region" in data['entry']['annotationsByCategory'].keys():
610 if len(all_diseases) > 0 : all_diseases = ";".join(all_diseases) 606 tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"]
611 else : all_diseases="NA" 607 all_tm_domains = set()
612 else : 608 for tm in tm_domains :
613 all_diseases="NA" 609 all_tm_domains.add(tm['cvTermName'])
614 610 nb_domains+=1
615 #get all tm domain 611 #print "nb domains ++"
616 nb_domains = 0 612 #print (nb_domains)
617 if "transmembrane-region" in data['entry']['annotationsByCategory'].keys(): 613 nextprot_file[:] = []
618 tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"] 614 nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])
619 all_tm_domains = set() 615 writer.writerows(nextprot_file)
620 for tm in tm_domains : 616
621 all_tm_domains.add(tm['cvTermName']) 617 id = str(10000000000 - int(time.strftime("%Y%m%d")))
622 nb_domains+=1
623
624 writer.writerow([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])
625
626 id = str(10000000000 - int(time.strftime("%Y%m%d")))
627 618
628 data_table_entry = dict(id=id, release=release_id, name = name, value = path) 619 data_table_entry = dict(id=id, release=release_id, name = name, value = path)
629 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref") 620 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref")
630 621
631 ####################################################################################################### 622 #######################################################################################################