# HG changeset patch
# User proteore
# Date 1591708766 0
# Node ID da9e74d3c40d493e549cd8c60a5bf022d18fd4bc
# Parent 8e60ad16028ad76b63c847bc5cac06f9f8921e68
"planemo upload commit a69fdb4cdc110c75fea7439f0d97b67158c1bbbf"
diff -r 8e60ad16028a -r da9e74d3c40d data_manager/resource_building.py
--- a/data_manager/resource_building.py Tue Jun 09 07:33:26 2020 +0000
+++ b/data_manager/resource_building.py Tue Jun 09 13:19:26 2020 +0000
@@ -280,7 +280,7 @@
ftp.quit()
return (path)
-def download_from_nextprot_ftp(file,target_directory) :
+def download_from_nextprot_ftp(file,target_directory):
ftp_dir = "pub/current_release/ac_lists/"
path = os.path.join(target_directory, file)
ftp = ftplib.FTP("ftp.nextprot.org")
@@ -288,19 +288,19 @@
ftp.cwd(ftp_dir)
ftp.retrbinary("RETR " + file, open(path, 'wb').write)
ftp.quit()
- return (path)
+ return path
-def id_list_from_nextprot_ftp(file,target_directory) :
- ftp_dir = "pub/current_release/ac_lists/"
- path = os.path.join(target_directory, file)
+def id_list_from_nextprot_ftp(file) :
ftp = ftplib.FTP("ftp.nextprot.org")
ftp.login("anonymous", "anonymous")
- ftp.cwd(ftp_dir)
- ftp.retrbinary("RETR " + file, open(path, 'wb').write)
+ r = StringIO()
+ ftp.retrlines("RETR " + file, lambda line: r.write(line + '\n'))
ftp.quit()
- with open(path,'r') as nextprot_ids :
- nextprot_ids = nextprot_ids.read().splitlines()
- return (nextprot_ids)
+ r.seek(0)
+ ids = r.readlines()
+ ids = [id.strip('\n') for id in ids]
+
+ return (ids)
#return '' if there's no value in a dictionary, avoid error
def access_dictionary (dico,key1,key2) :
@@ -544,77 +544,86 @@
#######################################################################################################
# 5. nextprot (add protein features)
#######################################################################################################
+def Build_nextprot_ref_file(data_manager_dict,target_directory):
-def Build_nextprot_ref_file(data_manager_dict,target_directory):
- nextprot_ids_file = "nextprot_ac_list_all.txt"
- ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory)
+ from requests_futures.sessions import FuturesSession
+ from concurrent.futures import ProcessPoolExecutor
+
+ #Get nextprot ids list
+ ids = id_list_from_nextprot_ftp("pub/current_release/ac_lists/nextprot_ac_list_all.txt")
output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv"
path = os.path.join(target_directory,output_file)
name = "neXtProt release "+time.strftime("%d-%m-%Y")
release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y")
-
+
+ #open output file to write
output = open(path, 'w')
writer = csv.writer(output,delimiter="\t")
-
- nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]]
- writer.writerows(nextprot_file)
-
- for id in ids :
- query="https://api.nextprot.org/entry/"+id+".json"
- try:
- resp = requests.get(url=query)
- except :
- print ("wainting 1 hour before trying again")
- time.sleep(3600)
- resp = requests.get(url=query)
- data = resp.json()
+ writer.writerow(["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"])
+
+ subset=100
+ ids_subsets = [ids[x:x+subset] for x in range(0, len(ids), subset)]
+
+ for ids_subset in ids_subsets:
+
+ #Open concurrent sessions
+ with FuturesSession(executor=ProcessPoolExecutor(max_workers=8)) as session:
+ futures = [session.get("https://api.nextprot.org/entry/"+id+".json") for id in ids_subset]
+
+ for id,future in zip(ids_subset,futures) :
- #get info from json dictionary
- mass_mol = data["entry"]["isoforms"][0]["massAsString"]
- seq_length = data['entry']["isoforms"][0]["sequenceLength"]
- iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"]
- chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"]
- protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level'])
+ #Get json dictionary
+ try:
+ res = future.result()
+ except:
+ print ("sleep 1 hour")
+ time.sleep(3600)
+ res = future.result()
+ data = res.json()
+
+ #get info from json dictionary
+ mass_mol = data["entry"]["isoforms"][0]["massAsString"]
+ seq_length = data['entry']["isoforms"][0]["sequenceLength"]
+ iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"]
+ chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"]
+ protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level'])
- #put all subcell loc in a set
- if "subcellular-location" in data['entry']["annotationsByCategory"].keys() :
- subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"]
- all_subcell_locs = set()
- for loc in subcell_locs :
- all_subcell_locs.add(loc['cvTermName'])
- all_subcell_locs.discard("")
- all_subcell_locs = ";".join(all_subcell_locs)
- else :
- all_subcell_locs = "NA"
-
- #put all subcell loc in a set
- if ('disease') in data['entry']['annotationsByCategory'].keys() :
- diseases = data['entry']['annotationsByCategory']['disease']
- all_diseases = set()
- for disease in diseases :
- if (disease['cvTermName'] is not None and disease['cvTermName'] != ""):
- all_diseases.add(disease['cvTermName'])
- if len(all_diseases) > 0 : all_diseases = ";".join(all_diseases)
- else : all_diseases="NA"
- else :
- all_diseases="NA"
+ #put all subcell loc in a set
+ if "subcellular-location" in data['entry']["annotationsByCategory"].keys() :
+ subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"]
+ all_subcell_locs = set()
+ for loc in subcell_locs :
+ all_subcell_locs.add(loc['cvTermName'])
+ all_subcell_locs.discard("")
+ all_subcell_locs = ";".join(all_subcell_locs)
+ else :
+ all_subcell_locs = "NA"
+
+ #put all diseases in a set
+ if ('disease') in data['entry']['annotationsByCategory'].keys() :
+ diseases = data['entry']['annotationsByCategory']['disease']
+ all_diseases = set()
+ for disease in diseases :
+ if (disease['cvTermName'] is not None and disease['cvTermName'] != ""):
+ all_diseases.add(disease['cvTermName'])
+ if len(all_diseases) > 0 : all_diseases = ";".join(all_diseases)
+ else : all_diseases="NA"
+ else :
+ all_diseases="NA"
- #get all tm domain
- nb_domains = 0
- if "transmembrane-region" in data['entry']['annotationsByCategory'].keys():
- tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"]
- all_tm_domains = set()
- for tm in tm_domains :
- all_tm_domains.add(tm['cvTermName'])
- nb_domains+=1
- #print "nb domains ++"
- #print (nb_domains)
- nextprot_file[:] = []
- nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])
- writer.writerows(nextprot_file)
+ #get all tm domain
+ nb_domains = 0
+ if "transmembrane-region" in data['entry']['annotationsByCategory'].keys():
+ tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"]
+ all_tm_domains = set()
+ for tm in tm_domains :
+ all_tm_domains.add(tm['cvTermName'])
+ nb_domains+=1
- id = str(10000000000 - int(time.strftime("%Y%m%d")))
+ writer.writerow([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])
+
+ id = str(10000000000 - int(time.strftime("%Y%m%d")))
data_table_entry = dict(id=id, release=release_id, name = name, value = path)
_add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref")
diff -r 8e60ad16028a -r da9e74d3c40d data_manager/resource_building.xml
--- a/data_manager/resource_building.xml Tue Jun 09 07:33:26 2020 +0000
+++ b/data_manager/resource_building.xml Tue Jun 09 13:19:26 2020 +0000
@@ -3,13 +3,15 @@
to create or update reference files for proteore tools
+ python
+ requests-futures