Mercurial > repos > proteore > proteore_data_manager
changeset 60:da9e74d3c40d draft
"planemo upload commit a69fdb4cdc110c75fea7439f0d97b67158c1bbbf"
|  |  |
| --- | --- |
| author | proteore |
| date | Tue, 09 Jun 2020 13:19:26 +0000 |
| parents | 8e60ad16028a |
| children | 5504538d24f6 |
| files | data_manager/resource_building.py data_manager/resource_building.xml |
| diffstat | 2 files changed, 81 insertions(+), 70 deletions(-) |
```diff
--- a/data_manager/resource_building.py Tue Jun 09 07:33:26 2020 +0000
+++ b/data_manager/resource_building.py Tue Jun 09 13:19:26 2020 +0000
@@ -280,7 +280,7 @@
     ftp.quit()
     return (path)
 
-def download_from_nextprot_ftp(file,target_directory) :
+def download_from_nextprot_ftp(file,target_directory):
     ftp_dir = "pub/current_release/ac_lists/"
     path = os.path.join(target_directory, file)
     ftp = ftplib.FTP("ftp.nextprot.org")
@@ -288,19 +288,19 @@
     ftp.cwd(ftp_dir)
     ftp.retrbinary("RETR " + file, open(path, 'wb').write)
     ftp.quit()
-    return (path)
+    return path
 
-def id_list_from_nextprot_ftp(file,target_directory) :
-    ftp_dir = "pub/current_release/ac_lists/"
-    path = os.path.join(target_directory, file)
+def id_list_from_nextprot_ftp(file) :
     ftp = ftplib.FTP("ftp.nextprot.org")
     ftp.login("anonymous", "anonymous")
-    ftp.cwd(ftp_dir)
-    ftp.retrbinary("RETR " + file, open(path, 'wb').write)
+    r = StringIO()
+    ftp.retrlines("RETR " + file, lambda line: r.write(line + '\n'))
     ftp.quit()
-    with open(path,'r') as nextprot_ids :
-        nextprot_ids = nextprot_ids.read().splitlines()
-    return (nextprot_ids)
+    r.seek(0)
+    ids = r.readlines()
+    ids = [id.strip('\n') for id in ids]
+
+    return (ids)
 
 #return '' if there's no value in a dictionary, avoid error
 def access_dictionary (dico,key1,key2) :
@@ -544,77 +544,86 @@
 #######################################################################################################
 # 5. nextprot (add protein features)
 #######################################################################################################
+def Build_nextprot_ref_file(data_manager_dict,target_directory):
 
-def Build_nextprot_ref_file(data_manager_dict,target_directory):
-    nextprot_ids_file = "nextprot_ac_list_all.txt"
-    ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory)
+    from requests_futures.sessions import FuturesSession
+    from concurrent.futures import ProcessPoolExecutor
+
+    #Get nextprot ids list
+    ids = id_list_from_nextprot_ftp("pub/current_release/ac_lists/nextprot_ac_list_all.txt")
 
     output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv"
     path = os.path.join(target_directory,output_file)
     name = "neXtProt release "+time.strftime("%d-%m-%Y")
     release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y")
-    
+
+    #open output file to write
     output = open(path, 'w')
     writer = csv.writer(output,delimiter="\t")
-
-    nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]]
-    writer.writerows(nextprot_file)
-
-    for id in ids :
-        query="https://api.nextprot.org/entry/"+id+".json"
-        try:
-            resp = requests.get(url=query)
-        except :
-            print ("wainting 1 hour before trying again")
-            time.sleep(3600)
-            resp = requests.get(url=query)
-        data = resp.json()
+    writer.writerow(["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"])
+
+    subset=100
+    ids_subsets = [ids[x:x+subset] for x in range(0, len(ids), subset)]
+
+    for ids_subset in ids_subsets:
+
+        #Open concurent sessions
+        with FuturesSession(executor=ProcessPoolExecutor(max_workers=8)) as session:
+            futures = [session.get("https://api.nextprot.org/entry/"+id+".json") for id in ids_subset]
+
+            for id,future in zip(ids_subset,futures) :
 
-        #get info from json dictionary
-        mass_mol = data["entry"]["isoforms"][0]["massAsString"]
-        seq_length = data['entry']["isoforms"][0]["sequenceLength"]
-        iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"]
-        chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"]
-        protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level'])
+                #Get json dictionary
+                try:
+                    res = future.result()
+                except:
+                    print ("sleep 1 hour")
+                    time.sleep(3600)
+                    res = future.result()
+                data = res.json()
+
+                #get info from json dictionary
+                mass_mol = data["entry"]["isoforms"][0]["massAsString"]
+                seq_length = data['entry']["isoforms"][0]["sequenceLength"]
+                iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"]
+                chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"]
+                protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level'])
 
-        #put all subcell loc in a set
-        if "subcellular-location" in data['entry']["annotationsByCategory"].keys() :
-            subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"]
-            all_subcell_locs = set()
-            for loc in subcell_locs :
-                all_subcell_locs.add(loc['cvTermName'])
-            all_subcell_locs.discard("")
-            all_subcell_locs = ";".join(all_subcell_locs)
-        else :
-            all_subcell_locs = "NA"
-
-        #put all subcell loc in a set
-        if ('disease') in data['entry']['annotationsByCategory'].keys() :
-            diseases = data['entry']['annotationsByCategory']['disease']
-            all_diseases = set()
-            for disease in diseases :
-                if (disease['cvTermName'] is not None and disease['cvTermName'] != ""):
-                    all_diseases.add(disease['cvTermName'])
-            if len(all_diseases) > 0 : all_diseases = ";".join(all_diseases)
-            else : all_diseases="NA"
-        else :
-            all_diseases="NA"
+                #put all subcell loc in a set
+                if "subcellular-location" in data['entry']["annotationsByCategory"].keys() :
+                    subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"]
+                    all_subcell_locs = set()
+                    for loc in subcell_locs :
+                        all_subcell_locs.add(loc['cvTermName'])
+                    all_subcell_locs.discard("")
+                    all_subcell_locs = ";".join(all_subcell_locs)
+                else :
+                    all_subcell_locs = "NA"
+
+                #put all subcell loc in a set
+                if ('disease') in data['entry']['annotationsByCategory'].keys() :
+                    diseases = data['entry']['annotationsByCategory']['disease']
+                    all_diseases = set()
+                    for disease in diseases :
+                        if (disease['cvTermName'] is not None and disease['cvTermName'] != ""):
+                            all_diseases.add(disease['cvTermName'])
+                    if len(all_diseases) > 0 : all_diseases = ";".join(all_diseases)
+                    else : all_diseases="NA"
+                else :
+                    all_diseases="NA"
 
-        #get all tm domain
-        nb_domains = 0
-        if "transmembrane-region" in data['entry']['annotationsByCategory'].keys():
-            tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"]
-            all_tm_domains = set()
-            for tm in tm_domains :
-                all_tm_domains.add(tm['cvTermName'])
-                nb_domains+=1
-                #print "nb domains ++"
-                #print (nb_domains)
-        nextprot_file[:] = []
-        nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])
-        writer.writerows(nextprot_file)
+                #get all tm domain
+                nb_domains = 0
+                if "transmembrane-region" in data['entry']['annotationsByCategory'].keys():
+                    tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"]
+                    all_tm_domains = set()
+                    for tm in tm_domains :
+                        all_tm_domains.add(tm['cvTermName'])
+                        nb_domains+=1
 
-    id = str(10000000000 - int(time.strftime("%Y%m%d")))
+                writer.writerow([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])
+
+    id = str(10000000000 - int(time.strftime("%Y%m%d")))
     data_table_entry = dict(id=id, release=release_id, name = name, value = path)
     _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref")
```
```diff
--- a/data_manager/resource_building.xml Tue Jun 09 07:33:26 2020 +0000
+++ b/data_manager/resource_building.xml Tue Jun 09 13:19:26 2020 +0000
@@ -3,13 +3,15 @@
     to create or update reference files for proteore tools
     </description>
     <requirements>
+        <requirement type="package" version="3.8.2">python</requirement>
+        <requirement type="package" version="1.0.0">requests-futures</requirement>
    </requirements>
    <stdio>
        <exit_code range="1:" />
    </stdio>
    <command><
```
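For context on the pattern this changeset introduces: the Python hunk above replaces one blocking `requests.get` per neXtProt entry with batches of 100 concurrent requests issued through `requests-futures`, and it now streams the accession list from the neXtProt FTP server into memory instead of writing a temporary file. Below is a minimal, self-contained sketch of that fetch pattern, not repository code: the helper names (`fetch_ac_list`, `fetch_entries`) are illustrative, and it relies on the library's default thread pool rather than the `ProcessPoolExecutor` wired in by the changeset. The FTP host, accession-list path, API URL, and JSON key path are taken from the diff.

```python
# Illustrative sketch only (not repository code): stream the neXtProt AC list over FTP
# into memory, then query the entry API in batches of concurrent requests.
import ftplib
from io import StringIO

from requests_futures.sessions import FuturesSession


def fetch_ac_list(remote_path):
    """Return the accession IDs listed in a neXtProt ac_list file, fetched over FTP."""
    buf = StringIO()
    ftp = ftplib.FTP("ftp.nextprot.org")
    ftp.login("anonymous", "anonymous")
    # retrlines calls the callback once per line, with line endings already stripped
    ftp.retrlines("RETR " + remote_path, lambda line: buf.write(line + "\n"))
    ftp.quit()
    return [line.strip() for line in buf.getvalue().splitlines() if line.strip()]


def fetch_entries(ids, batch_size=100, max_workers=8):
    """Yield (id, entry_json) pairs, querying the neXtProt entry API in concurrent batches."""
    with FuturesSession(max_workers=max_workers) as session:
        for start in range(0, len(ids), batch_size):
            subset = ids[start:start + batch_size]
            # fire off the whole batch, then collect the responses in order
            futures = [session.get("https://api.nextprot.org/entry/" + nxp_id + ".json")
                       for nxp_id in subset]
            for nxp_id, future in zip(subset, futures):
                yield nxp_id, future.result().json()


if __name__ == "__main__":
    ids = fetch_ac_list("pub/current_release/ac_lists/nextprot_ac_list_all.txt")
    for nxp_id, entry in fetch_entries(ids[:5]):
        print(nxp_id, entry["entry"]["isoforms"][0]["massAsString"])
```

Chunking the ID list bounds how many requests are in flight at once, which is the same throttling idea as the `subset=100` batching in the changeset.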