comparison data_manager/resource_building.py @ 6:f281a1eb83d6 draft

planemo upload commit 5df487b88ce2146c4be8e1d9f419006583185f6a-dirty
author proteore
date Mon, 11 Mar 2019 04:30:28 -0400
parents 429e7481c392
children d5badf9de1b0
comparison
equal deleted inserted replaced
5:429e7481c392 6:f281a1eb83d6
488 json.dump(dico, handle, sort_keys=True) 488 json.dump(dico, handle, sort_keys=True)
489 489
490 data_table_entry = dict(id=id, name = name, species = species, value = path) 490 data_table_entry = dict(id=id, name = name, species = species, value = path)
491 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries") 491 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries")
492 492
493 #######################################################################################################
494 # 5. nextprot (add protein features)
495 #######################################################################################################
496
def _nextprot_cv_terms(annotations, category):
    """Return the distinct ``cvTermName`` values of one annotation category.

    ``annotations`` is the ``annotationsByCategory`` dictionary of a neXtProt
    entry.  The terms are joined with ";".  "NA" is returned when the category
    is absent or when every term is None or empty.
    """
    terms = set()
    for annotation in annotations.get(category, []):
        term = annotation['cvTermName']
        if term is not None and term != "":
            terms.add(term)
    if not terms:
        return "NA"
    # Sorted so that the generated file does not depend on set iteration order.
    return ";".join(sorted(terms))


def _nextprot_count_helical_domains(annotations):
    """Count the "region structure" == "Helical" properties of "domain" annotations."""
    count = 0
    for annotation in annotations.get("domain", []):
        # Read the properties of the current annotation; indexing the
        # enclosing list with "properties" raises TypeError.
        for prop in annotation.get("properties") or []:
            if prop["name"] == "region structure" and prop["value"] == "Helical":
                count += 1
    return count


def Build_nextprot_ref_file(target_directory):
    """Build the neXtProt reference TSV used by the "add protein features" tool.

    For every identifier returned by ``id_list_from_nextprot_ftp`` the entry is
    fetched from the neXtProt API and one row is written with: molecular
    weight, sequence length and isoelectric point of the first isoform, the
    chromosome of the first chromosomal location, the subcellular locations,
    the diseases, the number of helical domains and the protein existence
    level.

    The table is written to ``nextprot_ref_<dd-mm-YYYY>.tsv`` inside
    ``target_directory`` and registered in the "proteore_nextprot_ref" data
    table.

    Args:
        target_directory: directory in which the TSV file is created; it is
            also handed to ``id_list_from_nextprot_ftp``.
    """
    nextprot_ids_file = "nextprot_ac_list_all.txt"
    # presumably the list of neXtProt accessions downloaded from the FTP
    # site -- verify against id_list_from_nextprot_ftp
    ids = id_list_from_nextprot_ftp(nextprot_ids_file, target_directory)

    nextprot_file = [["NextprotID", "MW", "SeqLength", "IsoPoint", "Chr",
                      "SubcellLocations", "Diseases", "TMDomains",
                      "ProteinExistence"]]
    for nextprot_id in ids:
        query = "https://api.nextprot.org/entry/" + nextprot_id + ".json"
        resp = requests.get(url=query)
        entry = resp.json()["entry"]

        # Only the first isoform and the first chromosomal location are kept.
        isoform = entry["isoforms"][0]
        mass_mol = isoform["massAsString"]
        seq_length = isoform["sequenceLength"]
        iso_elec_point = isoform["isoelectricPointAsString"]
        chr_loc = entry["chromosomalLocations"][0]["chromosome"]
        protein_existence = "PE" + str(entry["overview"]['proteinExistence']['level'])

        annotations = entry["annotationsByCategory"]
        all_subcell_locs = _nextprot_cv_terms(annotations, "subcellular-location")
        all_diseases = _nextprot_cv_terms(annotations, "disease")
        nb_domains = _nextprot_count_helical_domains(annotations)

        nextprot_file.append([nextprot_id, mass_mol, str(seq_length),
                              iso_elec_point, chr_loc, all_subcell_locs,
                              all_diseases, str(nb_domains), protein_existence])

    # Format the date once so file name, display name and id always agree.
    release_date = time.strftime("%d-%m-%Y")
    output_file = 'nextprot_ref_' + release_date + ".tsv"
    path = os.path.join(target_directory, output_file)
    name = "neXtProt release " + release_date
    entry_id = "nextprot_ref_" + release_date

    with open(path, 'w') as output:
        writer = csv.writer(output, delimiter="\t")
        writer.writerows(nextprot_file)

    data_table_entry = dict(id=entry_id, name=name, value=path)
    # NOTE(review): data_manager_dict is not a parameter of this function, so
    # it is resolved as a module-level name. Confirm that such a name exists;
    # otherwise this call raises NameError and the dictionary should be
    # passed in explicitly by the caller.
    _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref")
493 563
494 ####################################################################################################### 564 #######################################################################################################
495 # Main function 565 # Main function
496 ####################################################################################################### 566 #######################################################################################################
497 def main(): 567 def main():
501 parser.add_argument("--id_mapping", metavar = ("ID_MAPPING_SPECIES")) 571 parser.add_argument("--id_mapping", metavar = ("ID_MAPPING_SPECIES"))
502 parser.add_argument("--interactome", metavar = ("PPI")) 572 parser.add_argument("--interactome", metavar = ("PPI"))
503 parser.add_argument("--species") 573 parser.add_argument("--species")
504 parser.add_argument("--date") 574 parser.add_argument("--date")
505 parser.add_argument("-o", "--output") 575 parser.add_argument("-o", "--output")
576 parser.add_argument("--database")
506 args = parser.parse_args() 577 args = parser.parse_args()
507 578
508 data_manager_dict = {} 579 data_manager_dict = {}
509 # Extract json file params 580 # Extract json file params
510 filename = args.output 581 filename = args.output
555 except NameError: 626 except NameError:
556 interactome=None 627 interactome=None
557 species=None 628 species=None
558 if interactome is not None and species is not None: 629 if interactome is not None and species is not None:
559 PPI_ref_files(data_manager_dict, species, interactome, target_directory) 630 PPI_ref_files(data_manager_dict, species, interactome, target_directory)
560 631
632 ## Build nextprot ref file for add protein features
633 try:
634 database=args.database
635 except NameError:
636 database=None
637 if database is not None :
638 Build_nextprot_ref_file(target_directory)
639
561 #save info to json file 640 #save info to json file
562 filename = args.output 641 filename = args.output
563 open(filename, 'wb').write(to_json_string(data_manager_dict)) 642 open(filename, 'wb').write(to_json_string(data_manager_dict))
564 643
565 if __name__ == "__main__": 644 if __name__ == "__main__":