changeset 60:da9e74d3c40d draft

"planemo upload commit a69fdb4cdc110c75fea7439f0d97b67158c1bbbf"
author proteore
date Tue, 09 Jun 2020 13:19:26 +0000
parents 8e60ad16028a
children 5504538d24f6
files data_manager/resource_building.py data_manager/resource_building.xml
diffstat 2 files changed, 81 insertions(+), 70 deletions(-)
--- a/data_manager/resource_building.py	Tue Jun 09 07:33:26 2020 +0000
+++ b/data_manager/resource_building.py	Tue Jun 09 13:19:26 2020 +0000
@@ -280,7 +280,7 @@
     ftp.quit()
     return (path)
 
-def download_from_nextprot_ftp(file,target_directory) :
+def download_from_nextprot_ftp(file,target_directory):
     ftp_dir = "pub/current_release/ac_lists/"
     path = os.path.join(target_directory, file)
     ftp = ftplib.FTP("ftp.nextprot.org")
@@ -288,19 +288,19 @@
     ftp.cwd(ftp_dir)
     ftp.retrbinary("RETR " + file, open(path, 'wb').write)
     ftp.quit()
-    return (path)
+    return path
 
-def id_list_from_nextprot_ftp(file,target_directory) :
-    ftp_dir = "pub/current_release/ac_lists/"
-    path = os.path.join(target_directory, file)
+def id_list_from_nextprot_ftp(file):
     ftp = ftplib.FTP("ftp.nextprot.org")
     ftp.login("anonymous", "anonymous") 
-    ftp.cwd(ftp_dir)
-    ftp.retrbinary("RETR " + file, open(path, 'wb').write)
+    r = StringIO()
+    ftp.retrlines("RETR " + file, lambda line: r.write(line + '\n'))
     ftp.quit()
-    with open(path,'r') as nextprot_ids :
-        nextprot_ids = nextprot_ids.read().splitlines()
-    return (nextprot_ids)
+    r.seek(0)
+    ids = r.readlines()
+    ids = [nextprot_id.strip('\n') for nextprot_id in ids]
+
+    return ids
 
 #return '' if there's no value in a dictionary, avoid error
 def access_dictionary (dico,key1,key2) :
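
The rewritten id_list_from_nextprot_ftp above retrieves the accession list straight into memory with retrlines and a StringIO buffer instead of writing a temporary file. A minimal standalone sketch of that pattern, assuming the helper name fetch_ftp_text is illustrative and that `from io import StringIO` is already imported at the top of resource_building.py (the host, anonymous login and remote path are the ones used in the diff):

    import ftplib
    from io import StringIO

    def fetch_ftp_text(remote_path):
        # Stream a text file from ftp.nextprot.org into memory and return its lines,
        # e.g. remote_path = "pub/current_release/ac_lists/nextprot_ac_list_all.txt"
        ftp = ftplib.FTP("ftp.nextprot.org")
        ftp.login("anonymous", "anonymous")
        buf = StringIO()
        ftp.retrlines("RETR " + remote_path, lambda line: buf.write(line + "\n"))
        ftp.quit()
        return [line.strip() for line in buf.getvalue().splitlines()]
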
@@ -544,77 +544,86 @@
 #######################################################################################################
 # 5. nextprot (add protein features)
 #######################################################################################################
+def Build_nextprot_ref_file(data_manager_dict,target_directory):
 
-def Build_nextprot_ref_file(data_manager_dict,target_directory):
-    nextprot_ids_file = "nextprot_ac_list_all.txt"
-    ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory)
+    from requests_futures.sessions import FuturesSession
+    from concurrent.futures import ProcessPoolExecutor
+
+    #Get nextprot ids list
+    ids = id_list_from_nextprot_ftp("pub/current_release/ac_lists/nextprot_ac_list_all.txt")
     
     output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv"
     path = os.path.join(target_directory,output_file)
     name = "neXtProt release "+time.strftime("%d-%m-%Y")
     release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y")
-    
+
+    #open output file to write
     output = open(path, 'w')
     writer = csv.writer(output,delimiter="\t")
-        
-    nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]]
-    writer.writerows(nextprot_file)
-    
-    for id in ids :
-        query="https://api.nextprot.org/entry/"+id+".json"
-        try:
-            resp = requests.get(url=query)
-        except :
-            print ("wainting 1 hour before trying again")
-            time.sleep(3600)
-            resp = requests.get(url=query)
-        data = resp.json()
+    writer.writerow(["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"])
+
+    subset=100
+    ids_subsets = [ids[x:x+subset] for x in range(0, len(ids), subset)]
+
+    for ids_subset in ids_subsets:
+
+        #Open a concurrent session and submit one request per ID in this subset
+        with FuturesSession(executor=ProcessPoolExecutor(max_workers=8)) as session:
+            futures = [session.get("https://api.nextprot.org/entry/"+id+".json") for id in ids_subset]
+
+        for id,future in zip(ids_subset,futures) :
 
-        #get info from json dictionary
-        mass_mol = data["entry"]["isoforms"][0]["massAsString"]
-        seq_length = data['entry']["isoforms"][0]["sequenceLength"]
-        iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"]
-        chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"]        
-        protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level'])
+            #Get the JSON response; on failure, wait an hour and retry once
+            try:
+                res = future.result()
+            except:
+                print ("waiting 1 hour before trying again")
+                time.sleep(3600)
+                res = future.result()
+            data = res.json()
+
+            #get info from json dictionary
+            mass_mol = data["entry"]["isoforms"][0]["massAsString"]
+            seq_length = data['entry']["isoforms"][0]["sequenceLength"]
+            iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"]
+            chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"]        
+            protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level'])
 
-        #put all subcell loc in a set
-        if "subcellular-location" in data['entry']["annotationsByCategory"].keys() :
-            subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"]
-            all_subcell_locs = set()
-            for loc in subcell_locs :
-                all_subcell_locs.add(loc['cvTermName'])
-            all_subcell_locs.discard("")
-            all_subcell_locs = ";".join(all_subcell_locs)
-        else :
-            all_subcell_locs = "NA"
-        
-        #put all subcell loc in a set
-        if ('disease') in data['entry']['annotationsByCategory'].keys() :
-            diseases = data['entry']['annotationsByCategory']['disease']
-            all_diseases = set()
-            for disease in diseases :
-                if (disease['cvTermName'] is not None and disease['cvTermName'] != ""):
-                    all_diseases.add(disease['cvTermName'])
-            if len(all_diseases) > 0 : all_diseases = ";".join(all_diseases)
-            else : all_diseases="NA"
-        else :
-            all_diseases="NA"
+            #put all subcell loc in a set
+            if "subcellular-location" in data['entry']["annotationsByCategory"].keys() :
+                subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"]
+                all_subcell_locs = set()
+                for loc in subcell_locs :
+                    all_subcell_locs.add(loc['cvTermName'])
+                all_subcell_locs.discard("")
+                all_subcell_locs = ";".join(all_subcell_locs)
+            else :
+                all_subcell_locs = "NA"
+            
+            #put all diseases in a set
+            if ('disease') in data['entry']['annotationsByCategory'].keys() :
+                diseases = data['entry']['annotationsByCategory']['disease']
+                all_diseases = set()
+                for disease in diseases :
+                    if (disease['cvTermName'] is not None and disease['cvTermName'] != ""):
+                        all_diseases.add(disease['cvTermName'])
+                if len(all_diseases) > 0 : all_diseases = ";".join(all_diseases)
+                else : all_diseases="NA"
+            else :
+                all_diseases="NA"
 
-        #get all tm domain 
-        nb_domains = 0
-        if  "transmembrane-region" in data['entry']['annotationsByCategory'].keys():
-            tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"]
-            all_tm_domains = set()
-            for tm in tm_domains :
-                all_tm_domains.add(tm['cvTermName'])
-                nb_domains+=1
-                #print "nb domains ++"
-                #print (nb_domains)
-        nextprot_file[:] = [] 
-        nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])
-        writer.writerows(nextprot_file)
+            #get all tm domain 
+            nb_domains = 0
+            if  "transmembrane-region" in data['entry']['annotationsByCategory'].keys():
+                tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"]
+                all_tm_domains = set()
+                for tm in tm_domains :
+                    all_tm_domains.add(tm['cvTermName'])
+                    nb_domains+=1
 
-        id = str(10000000000 - int(time.strftime("%Y%m%d")))
+            writer.writerow([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])
+
+    id = str(10000000000 - int(time.strftime("%Y%m%d")))
 
     data_table_entry = dict(id=id, release=release_id, name = name, value = path)
     _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref")
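
The loop in Build_nextprot_ref_file now replaces one-at-a-time requests.get calls with slices of 100 accessions submitted through a requests-futures session backed by a process pool. A minimal sketch of the same batching pattern, with the futures consumed inside the session context; fetch_entries and its default arguments are illustrative, only the neXtProt API URL, the subset size of 100 and the 8-worker process pool come from the diff, and the one-hour retry shown above is omitted for brevity:

    from concurrent.futures import ProcessPoolExecutor
    from requests_futures.sessions import FuturesSession

    def fetch_entries(ids, subset=100, workers=8):
        # Query the neXtProt entry API concurrently, one slice of `subset`
        # accessions at a time, yielding (accession, parsed JSON) pairs.
        for start in range(0, len(ids), subset):
            batch = ids[start:start + subset]
            with FuturesSession(executor=ProcessPoolExecutor(max_workers=workers)) as session:
                futures = [session.get("https://api.nextprot.org/entry/" + acc + ".json")
                           for acc in batch]
                for acc, future in zip(batch, futures):
                    yield acc, future.result().json()
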
--- a/data_manager/resource_building.xml	Tue Jun 09 07:33:26 2020 +0000
+++ b/data_manager/resource_building.xml	Tue Jun 09 13:19:26 2020 +0000
@@ -3,13 +3,15 @@
 to create or update reference files for proteore tools
 </description>
 <requirements>
+    <requirement type="package" version="3.8.2">python</requirement>
+    <requirement type="package" version="1.0.0">requests-futures</requirement>
 </requirements>
 <stdio>
   <exit_code range="1:" />
 </stdio>
 <command><![CDATA[
 
-    python $__tool_directory__/resource_building.py
+    python3 $__tool_directory__/resource_building.py
     #if $database.database == "human_protein_atlas"
         --hpa "$database.tissues"
     #else if $database.database == "peptide_atlas"