Mercurial > repos > proteore > proteore_data_manager
comparison data_manager/resource_building.py @ 6:f281a1eb83d6 draft
planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
| author | proteore |
|---|---|
| date | Mon, 11 Mar 2019 04:30:28 -0400 |
| parents | 429e7481c392 |
| children | d5badf9de1b0 |
comparison
equal
deleted
inserted
replaced
| 5:429e7481c392 | 6:f281a1eb83d6 |
|---|---|
| 488 json.dump(dico, handle, sort_keys=True) | 488 json.dump(dico, handle, sort_keys=True) |
| 489 | 489 |
| 490 data_table_entry = dict(id=id, name = name, species = species, value = path) | 490 data_table_entry = dict(id=id, name = name, species = species, value = path) |
| 491 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries") | 491 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries") |
| 492 | 492 |
| 493 ####################################################################################################### | |
| 494 # 5. nextprot (add protein features) | |
| 495 ####################################################################################################### | |
| 496 | |
| 497 def Build_nextprot_ref_file(target_directory): | |
| 498 nextprot_ids_file = "nextprot_ac_list_all.txt" | |
| 499 ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory) | |
| 500 | |
| 501 nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]] | |
| 502 for id in ids : | |
| 503 #print (id) | |
| 504 query="https://api.nextprot.org/entry/"+id+".json" | |
| 505 resp = requests.get(url=query) | |
| 506 data = resp.json() | |
| 507 | |
| 508 #get info from json dictionary | |
| 509 mass_mol = data["entry"]["isoforms"][0]["massAsString"] | |
| 510 seq_length = data['entry']["isoforms"][0]["sequenceLength"] | |
| 511 iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"] | |
| 512 chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"] | |
| 513 protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level']) | |
| 514 | |
| 515 #put all subcell loc in a set | |
| 516 if "subcellular-location" in data['entry']["annotationsByCategory"].keys() : | |
| 517 subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"] | |
| 518 all_subcell_locs = set() | |
| 519 for loc in subcell_locs : | |
| 520 all_subcell_locs.add(loc['cvTermName']) | |
| 521 all_subcell_locs.discard("") | |
| 522 all_subcell_locs = ";".join(all_subcell_locs) | |
| 523 else : | |
| 524 all_subcell_locs = "NA" | |
| 525 | |
| 526 #put all diseases in a set | |
| 527 if ('disease') in data['entry']['annotationsByCategory'].keys() : | |
| 528 diseases = data['entry']['annotationsByCategory']['disease'] | |
| 529 all_diseases = set() | |
| 530 for disease in diseases : | |
| 531 if (disease['cvTermName'] is not None and disease['cvTermName'] != ""): | |
| 532 all_diseases.add(disease['cvTermName']) | |
| 533 if len(all_diseases) > 0 : all_diseases = ";".join(all_diseases) | |
| 534 else : all_diseases="NA" | |
| 535 else : | |
| 536 all_diseases="NA" | |
| 537 | |
| 538 #count the helical tm domains | |
| 539 nb_domains = 0 | |
| 540 if "domain" in data['entry']['annotationsByCategory'].keys(): | |
| 541 tm_domains = data['entry']['annotationsByCategory']["domain"] | |
| 542 for tm_domain in tm_domains : | |
| 543 if "properties" in tm_domain.keys() and tm_domain['properties']!=[]: | |
| 544 domains = tm_domains["properties"] | |
| 545 for domain in domains : | |
| 546 if domain["name"]=="region structure" and domain["value"]=="Helical" : | |
| 547 nb_domains+=1 | |
| 548 | |
| 549 | |
| 550 nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence]) | |
| 551 | |
| 552 output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv" | |
| 553 path = os.path.join(target_directory,output_file) | |
| 554 name = "neXtProt release "+time.strftime("%d-%m-%Y") | |
| 555 id = "nextprot_ref_"+time.strftime("%d-%m-%Y") | |
| 556 | |
| 557 with open(path, 'w') as output: | |
| 558 writer = csv.writer(output,delimiter="\t") | |
| 559 writer.writerows(nextprot_file) | |
| 560 | |
| 561 data_table_entry = dict(id=id, name = name, value = path) | |
| 562 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref") | |
| 493 | 563 |
| 494 ####################################################################################################### | 564 ####################################################################################################### |
| 495 # Main function | 565 # Main function |
| 496 ####################################################################################################### | 566 ####################################################################################################### |
| 497 def main(): | 567 def main(): |
| 501 parser.add_argument("--id_mapping", metavar = ("ID_MAPPING_SPECIES")) | 571 parser.add_argument("--id_mapping", metavar = ("ID_MAPPING_SPECIES")) |
| 502 parser.add_argument("--interactome", metavar = ("PPI")) | 572 parser.add_argument("--interactome", metavar = ("PPI")) |
| 503 parser.add_argument("--species") | 573 parser.add_argument("--species") |
| 504 parser.add_argument("--date") | 574 parser.add_argument("--date") |
| 505 parser.add_argument("-o", "--output") | 575 parser.add_argument("-o", "--output") |
| 576 parser.add_argument("--database") | |
| 506 args = parser.parse_args() | 577 args = parser.parse_args() |
| 507 | 578 |
| 508 data_manager_dict = {} | 579 data_manager_dict = {} |
| 509 # Extract json file params | 580 # Extract json file params |
| 510 filename = args.output | 581 filename = args.output |
| 555 except NameError: | 626 except NameError: |
| 556 interactome=None | 627 interactome=None |
| 557 species=None | 628 species=None |
| 558 if interactome is not None and species is not None: | 629 if interactome is not None and species is not None: |
| 559 PPI_ref_files(data_manager_dict, species, interactome, target_directory) | 630 PPI_ref_files(data_manager_dict, species, interactome, target_directory) |
| 560 | 631 |
| 632 ## Build the nextprot ref file used to add protein features | |
| 633 try: | |
| 634 database=args.database | |
| 635 except NameError: | |
| 636 database=None | |
| 637 if database is not None : | |
| 638 Build_nextprot_ref_file(target_directory) | |
| 639 | |
| 561 #save info to json file | 640 #save info to json file |
| 562 filename = args.output | 641 filename = args.output |
| 563 open(filename, 'wb').write(to_json_string(data_manager_dict)) | 642 open(filename, 'wb').write(to_json_string(data_manager_dict)) |
| 564 | 643 |
| 565 if __name__ == "__main__": | 644 if __name__ == "__main__": |
