Mercurial > repos > proteore > proteore_data_manager
comparison data_manager/resource_building.py @ 62:add6aa698fb0 draft
"planemo upload commit 4747fc3ca8e24e0f6c0cfcde0992780c6d4ef4ff-dirty"
| author | proteore |
|---|---|
| date | Tue, 09 Jun 2020 16:14:33 +0000 |
| parents | da9e74d3c40d |
| children | 54089754ba12 |
comparison
Legend (row types in the side-by-side table below): equal, deleted, inserted, replaced
| 61:5504538d24f6 | 62:add6aa698fb0 |
|---|---|
| 278 ftp.cwd(ftp_dir) | 278 ftp.cwd(ftp_dir) |
| 279 ftp.retrbinary("RETR " + file, open(path, 'wb').write) | 279 ftp.retrbinary("RETR " + file, open(path, 'wb').write) |
| 280 ftp.quit() | 280 ftp.quit() |
| 281 return (path) | 281 return (path) |
| 282 | 282 |
| 283 def download_from_nextprot_ftp(file,target_directory): | 283 def download_from_nextprot_ftp(file,target_directory) : |
| 284 ftp_dir = "pub/current_release/ac_lists/" | 284 ftp_dir = "pub/current_release/ac_lists/" |
| 285 path = os.path.join(target_directory, file) | 285 path = os.path.join(target_directory, file) |
| 286 ftp = ftplib.FTP("ftp.nextprot.org") | 286 ftp = ftplib.FTP("ftp.nextprot.org") |
| 287 ftp.login("anonymous", "anonymous") | 287 ftp.login("anonymous", "anonymous") |
| 288 ftp.cwd(ftp_dir) | 288 ftp.cwd(ftp_dir) |
| 289 ftp.retrbinary("RETR " + file, open(path, 'wb').write) | 289 ftp.retrbinary("RETR " + file, open(path, 'wb').write) |
| 290 ftp.quit() | 290 ftp.quit() |
| 291 return path | 291 return (path) |
| 292 | 292 |
| 293 def id_list_from_nextprot_ftp(file) : | 293 def id_list_from_nextprot_ftp(file,target_directory) : |
| 294 ftp_dir = "pub/current_release/ac_lists/" | |
| 295 path = os.path.join(target_directory, file) | |
| 294 ftp = ftplib.FTP("ftp.nextprot.org") | 296 ftp = ftplib.FTP("ftp.nextprot.org") |
| 295 ftp.login("anonymous", "anonymous") | 297 ftp.login("anonymous", "anonymous") |
| 296 r = StringIO() | 298 ftp.cwd(ftp_dir) |
| 297 ftp.retrlines("RETR " + file, lambda line: r.write(line + '\n')) | 299 ftp.retrbinary("RETR " + file, open(path, 'wb').write) |
| 298 ftp.quit() | 300 ftp.quit() |
| 299 r.seek(0) | 301 with open(path,'r') as nextprot_ids : |
| 300 ids = r.readlines() | 302 nextprot_ids = nextprot_ids.read().splitlines() |
| 301 ids = [id.strip('\n') for id in ids] | 303 return (nextprot_ids) |
| 302 | |
| 303 return (ids) | |
| 304 | 304 |
| 305 #return '' if there's no value in a dictionary, avoid error | 305 #return '' if there's no value in a dictionary, avoid error |
| 306 def access_dictionary (dico,key1,key2) : | 306 def access_dictionary (dico,key1,key2) : |
| 307 if key1 in dico : | 307 if key1 in dico : |
| 308 if key2 in dico[key1] : | 308 if key2 in dico[key1] : |
| 542 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries") | 542 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries") |
| 543 | 543 |
| 544 ####################################################################################################### | 544 ####################################################################################################### |
| 545 # 5. nextprot (add protein features) | 545 # 5. nextprot (add protein features) |
| 546 ####################################################################################################### | 546 ####################################################################################################### |
| 547 | |
| 547 def Build_nextprot_ref_file(data_manager_dict,target_directory): | 548 def Build_nextprot_ref_file(data_manager_dict,target_directory): |
| 548 | 549 nextprot_ids_file = "nextprot_ac_list_all.txt" |
| 549 from requests_futures.sessions import FuturesSession | 550 ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory) |
| 550 from concurrent.futures import ProcessPoolExecutor | |
| 551 | |
| 552 #Get nextprot ids list | |
| 553 ids = id_list_from_nextprot_ftp("pub/current_release/ac_lists/nextprot_ac_list_all.txt") | |
| 554 | 551 |
| 555 output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv" | 552 output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv" |
| 556 path = os.path.join(target_directory,output_file) | 553 path = os.path.join(target_directory,output_file) |
| 557 name = "neXtProt release "+time.strftime("%d-%m-%Y") | 554 name = "neXtProt release "+time.strftime("%d-%m-%Y") |
| 558 release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y") | 555 release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y") |
| 559 | 556 |
| 560 #open output file to write | |
| 561 output = open(path, 'w') | 557 output = open(path, 'w') |
| 562 writer = csv.writer(output,delimiter="\t") | 558 writer = csv.writer(output,delimiter="\t") |
| 563 writer.writerow(["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]) | 559 |
| 564 | 560 nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]] |
| 565 subset=100 | 561 writer.writerows(nextprot_file) |
| 566 ids_subsets = [ids[x:x+subset] for x in range(0, len(ids), subset)] | 562 |
| 567 | 563 for id in ids : |
| 568 for ids_subset in ids_subsets: | 564 query="https://api.nextprot.org/entry/"+id+".json" |
| 569 | 565 try: |
| 570 #Open concurent sessions | 566 resp = requests.get(url=query) |
| 571 with FuturesSession(executor=ProcessPoolExecutor(max_workers=8)) as session: | 567 except : |
| 572 futures = [session.get("https://api.nextprot.org/entry/"+id+".json") for id in ids_subset] | 568 print ("wainting 1 hour before trying again") |
| 573 | 569 time.sleep(3600) |
| 574 for id,future in zip(ids_subset,futures) : | 570 resp = requests.get(url=query) |
| 575 | 571 data = resp.json() |
| 576 #Get json dictionary | 572 |
| 577 try: | 573 #get info from json dictionary |
| 578 res = future.result() | 574 mass_mol = data["entry"]["isoforms"][0]["massAsString"] |
| 579 except: | 575 seq_length = data['entry']["isoforms"][0]["sequenceLength"] |
| 580 print ("sleep 1 hour") | 576 iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"] |
| 581 time.sleep(3600) | 577 chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"] |
| 582 res = future.result() | 578 protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level']) |
| 583 data = res.json() | 579 |
| 584 | 580 #put all subcell loc in a set |
| 585 #get info from json dictionary | 581 if "subcellular-location" in data['entry']["annotationsByCategory"].keys() : |
| 586 mass_mol = data["entry"]["isoforms"][0]["massAsString"] | 582 subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"] |
| 587 seq_length = data['entry']["isoforms"][0]["sequenceLength"] | 583 all_subcell_locs = set() |
| 588 iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"] | 584 for loc in subcell_locs : |
| 589 chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"] | 585 all_subcell_locs.add(loc['cvTermName']) |
| 590 protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level']) | 586 all_subcell_locs.discard("") |
| 591 | 587 all_subcell_locs = ";".join(all_subcell_locs) |
| 592 #put all subcell loc in a set | 588 else : |
| 593 if "subcellular-location" in data['entry']["annotationsByCategory"].keys() : | 589 all_subcell_locs = "NA" |
| 594 subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"] | 590 |
| 595 all_subcell_locs = set() | 591 #put all subcell loc in a set |
| 596 for loc in subcell_locs : | 592 if ('disease') in data['entry']['annotationsByCategory'].keys() : |
| 597 all_subcell_locs.add(loc['cvTermName']) | 593 diseases = data['entry']['annotationsByCategory']['disease'] |
| 598 all_subcell_locs.discard("") | 594 all_diseases = set() |
| 599 all_subcell_locs = ";".join(all_subcell_locs) | 595 for disease in diseases : |
| 600 else : | 596 if (disease['cvTermName'] is not None and disease['cvTermName'] != ""): |
| 601 all_subcell_locs = "NA" | 597 all_diseases.add(disease['cvTermName']) |
| 602 | 598 if len(all_diseases) > 0 : all_diseases = ";".join(all_diseases) |
| 603 #put all subcell loc in a set | 599 else : all_diseases="NA" |
| 604 if ('disease') in data['entry']['annotationsByCategory'].keys() : | 600 else : |
| 605 diseases = data['entry']['annotationsByCategory']['disease'] | 601 all_diseases="NA" |
| 606 all_diseases = set() | 602 |
| 607 for disease in diseases : | 603 #get all tm domain |
| 608 if (disease['cvTermName'] is not None and disease['cvTermName'] != ""): | 604 nb_domains = 0 |
| 609 all_diseases.add(disease['cvTermName']) | 605 if "transmembrane-region" in data['entry']['annotationsByCategory'].keys(): |
| 610 if len(all_diseases) > 0 : all_diseases = ";".join(all_diseases) | 606 tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"] |
| 611 else : all_diseases="NA" | 607 all_tm_domains = set() |
| 612 else : | 608 for tm in tm_domains : |
| 613 all_diseases="NA" | 609 all_tm_domains.add(tm['cvTermName']) |
| 614 | 610 nb_domains+=1 |
| 615 #get all tm domain | 611 #print "nb domains ++" |
| 616 nb_domains = 0 | 612 #print (nb_domains) |
| 617 if "transmembrane-region" in data['entry']['annotationsByCategory'].keys(): | 613 nextprot_file[:] = [] |
| 618 tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"] | 614 nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence]) |
| 619 all_tm_domains = set() | 615 writer.writerows(nextprot_file) |
| 620 for tm in tm_domains : | 616 |
| 621 all_tm_domains.add(tm['cvTermName']) | 617 id = str(10000000000 - int(time.strftime("%Y%m%d"))) |
| 622 nb_domains+=1 | |
| 623 | |
| 624 writer.writerow([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence]) | |
| 625 | |
| 626 id = str(10000000000 - int(time.strftime("%Y%m%d"))) | |
| 627 | 618 |
| 628 data_table_entry = dict(id=id, release=release_id, name = name, value = path) | 619 data_table_entry = dict(id=id, release=release_id, name = name, value = path) |
| 629 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref") | 620 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref") |
| 630 | 621 |
| 631 ####################################################################################################### | 622 ####################################################################################################### |
