proteore_data_manager: data_manager/resource

comparison data_manager/resource_building.py @ 60:da9e74d3c40d draft

"planemo upload commit a69fdb4cdc110c75fea7439f0d97b67158c1bbbf"

author	proteore
date	Tue, 09 Jun 2020 13:19:26 +0000
parents	149eb9e80717
children	add6aa698fb0

comparison

equal deleted inserted replaced

-:8e60ad16028a
+:da9e74d3c40d
 ftp.cwd(ftp_dir)
 ftp.retrbinary("RETR " + file, open(path, 'wb').write)
 ftp.quit()
 return (path)
-def download_from_nextprot_ftp(file,target_directory) :
+def download_from_nextprot_ftp(file,target_directory):
 ftp_dir = "pub/current_release/ac_lists/"
 path = os.path.join(target_directory, file)
 ftp = ftplib.FTP("ftp.nextprot.org")
 ftp.login("anonymous", "anonymous")
 ftp.cwd(ftp_dir)
 ftp.retrbinary("RETR " + file, open(path, 'wb').write)
 ftp.quit()
-return (path)
+return path
-def id_list_from_nextprot_ftp(file,target_directory) :
+def id_list_from_nextprot_ftp(file) :
-ftp_dir = "pub/current_release/ac_lists/"
-path = os.path.join(target_directory, file)
 ftp = ftplib.FTP("ftp.nextprot.org")
 ftp.login("anonymous", "anonymous")
-ftp.cwd(ftp_dir)
+r = StringIO()
-ftp.retrbinary("RETR " + file, open(path, 'wb').write)
+ftp.retrlines("RETR " + file, lambda line: r.write(line + '\n'))
 ftp.quit()
-with open(path,'r') as nextprot_ids :
+r.seek(0)
-nextprot_ids = nextprot_ids.read().splitlines()
+ids = r.readlines()
-return (nextprot_ids)
+ids = [id.strip('\n') for id in ids]
+return (ids)
 #return '' if there's no value in a dictionary, avoid error
 def access_dictionary (dico,key1,key2) :
 if key1 in dico :
 if key2 in dico[key1] :
 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_"+interactome+"_dictionaries")
 #######################################################################################################
 # 5. nextprot (add protein features)
 #######################################################################################################
 def Build_nextprot_ref_file(data_manager_dict,target_directory):
-nextprot_ids_file = "nextprot_ac_list_all.txt"
-ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory)
+from requests_futures.sessions import FuturesSession
+from concurrent.futures import ProcessPoolExecutor
+#Get nextprot ids list
+ids = id_list_from_nextprot_ftp("pub/current_release/ac_lists/nextprot_ac_list_all.txt")
 output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv"
 path = os.path.join(target_directory,output_file)
 name = "neXtProt release "+time.strftime("%d-%m-%Y")
 release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y")
+#open output file to write
 output = open(path, 'w')
 writer = csv.writer(output,delimiter="\t")
+writer.writerow(["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"])
-nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]]
-writer.writerows(nextprot_file)
+subset=100
+ids_subsets = [ids[x:x+subset] for x in range(0, len(ids), subset)]
-for id in ids :
-query="https://api.nextprot.org/entry/"+id+".json"
+for ids_subset in ids_subsets:
-try:
-resp = requests.get(url=query)
+#Open concurrent sessions
-except :
+with FuturesSession(executor=ProcessPoolExecutor(max_workers=8)) as session:
-print ("wainting 1 hour before trying again")
+futures = [session.get("https://api.nextprot.org/entry/"+id+".json") for id in ids_subset]
-time.sleep(3600)
-resp = requests.get(url=query)
+for id,future in zip(ids_subset,futures) :
-data = resp.json()
+#Get json dictionary
-#get info from json dictionary
+try:
-mass_mol = data["entry"]["isoforms"][0]["massAsString"]
+res = future.result()
-seq_length = data['entry']["isoforms"][0]["sequenceLength"]
+except:
-iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"]
+print ("sleep 1 hour")
-chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"]
+time.sleep(3600)
-protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level'])
+res = future.result()
+data = res.json()
-#put all subcell loc in a set
-if "subcellular-location" in data['entry']["annotationsByCategory"].keys() :
+#get info from json dictionary
-subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"]
+mass_mol = data["entry"]["isoforms"][0]["massAsString"]
-all_subcell_locs = set()
+seq_length = data['entry']["isoforms"][0]["sequenceLength"]
-for loc in subcell_locs :
+iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"]
-all_subcell_locs.add(loc['cvTermName'])
+chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"]
-all_subcell_locs.discard("")
+protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level'])
-all_subcell_locs = ";".join(all_subcell_locs)
-else :
+#put all subcell loc in a set
-all_subcell_locs = "NA"
+if "subcellular-location" in data['entry']["annotationsByCategory"].keys() :
+subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"]
-#put all subcell loc in a set
+all_subcell_locs = set()
-if ('disease') in data['entry']['annotationsByCategory'].keys() :
+for loc in subcell_locs :
-diseases = data['entry']['annotationsByCategory']['disease']
+all_subcell_locs.add(loc['cvTermName'])
-all_diseases = set()
+all_subcell_locs.discard("")
-for disease in diseases :
+all_subcell_locs = ";".join(all_subcell_locs)
-if (disease['cvTermName'] is not None and disease['cvTermName'] != ""):
+else :
-all_diseases.add(disease['cvTermName'])
+all_subcell_locs = "NA"
-if len(all_diseases) > 0 : all_diseases = ";".join(all_diseases)
-else : all_diseases="NA"
+#put all diseases in a set
-else :
+if ('disease') in data['entry']['annotationsByCategory'].keys() :
-all_diseases="NA"
+diseases = data['entry']['annotationsByCategory']['disease']
+all_diseases = set()
-#get all tm domain
+for disease in diseases :
-nb_domains = 0
+if (disease['cvTermName'] is not None and disease['cvTermName'] != ""):
-if  "transmembrane-region" in data['entry']['annotationsByCategory'].keys():
+all_diseases.add(disease['cvTermName'])
-tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"]
+if len(all_diseases) > 0 : all_diseases = ";".join(all_diseases)
-all_tm_domains = set()
+else : all_diseases="NA"
-for tm in tm_domains :
+else :
-all_tm_domains.add(tm['cvTermName'])
+all_diseases="NA"
-nb_domains+=1
-#print "nb domains ++"
+#count all transmembrane (TM) domains
-#print (nb_domains)
+nb_domains = 0
-nextprot_file[:] = []
+if  "transmembrane-region" in data['entry']['annotationsByCategory'].keys():
-nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])
+tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"]
-writer.writerows(nextprot_file)
+all_tm_domains = set()
+for tm in tm_domains :
-id = str(10000000000 - int(time.strftime("%Y%m%d")))
+all_tm_domains.add(tm['cvTermName'])
+nb_domains+=1
+writer.writerow([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])
+id = str(10000000000 - int(time.strftime("%Y%m%d")))
 data_table_entry = dict(id=id, release=release_id, name = name, value = path)
 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref")
 #######################################################################################################

Mercurial > repos > proteore > proteore_data_manager

comparison data_manager/resource_building.py @ 60:da9e74d3c40d draft