proteore/proteore_data_manager: diff data_manager/resource_building.py @ 62:add6aa698fb0 draft
"planemo upload commit 4747fc3ca8e24e0f6c0cfcde0992780c6d4ef4ff-dirty"
| field | value |
|---|---|
| author | proteore |
| date | Tue, 09 Jun 2020 16:14:33 +0000 |
| parents | da9e74d3c40d |
| children | 54089754ba12 |
--- a/data_manager/resource_building.py	Tue Jun 09 15:12:03 2020 +0000
+++ b/data_manager/resource_building.py	Tue Jun 09 16:14:33 2020 +0000
@@ -280,7 +280,7 @@
     ftp.quit()
     return (path)
 
-def download_from_nextprot_ftp(file,target_directory):
+def download_from_nextprot_ftp(file,target_directory) :
     ftp_dir = "pub/current_release/ac_lists/"
     path = os.path.join(target_directory, file)
     ftp = ftplib.FTP("ftp.nextprot.org")
@@ -288,19 +288,19 @@
     ftp.cwd(ftp_dir)
     ftp.retrbinary("RETR " + file, open(path, 'wb').write)
     ftp.quit()
-    return path
+    return (path)
 
-def id_list_from_nextprot_ftp(file) :
+def id_list_from_nextprot_ftp(file,target_directory) :
+    ftp_dir = "pub/current_release/ac_lists/"
+    path = os.path.join(target_directory, file)
     ftp = ftplib.FTP("ftp.nextprot.org")
     ftp.login("anonymous", "anonymous")
-    r = StringIO()
-    ftp.retrlines("RETR " + file, lambda line: r.write(line + '\n'))
+    ftp.cwd(ftp_dir)
+    ftp.retrbinary("RETR " + file, open(path, 'wb').write)
     ftp.quit()
-    r.seek(0)
-    ids = r.readlines()
-    ids = [id.strip('\n') for id in ids]
-
-    return (ids)
+    with open(path,'r') as nextprot_ids :
+        nextprot_ids = nextprot_ids.read().splitlines()
+    return (nextprot_ids)
 
 #return '' if there's no value in a dictionary, avoid error
 def access_dictionary (dico,key1,key2) :
@@ -544,86 +544,77 @@
 #######################################################################################################
 # 5. nextprot (add protein features)
 #######################################################################################################
+
 def Build_nextprot_ref_file(data_manager_dict,target_directory):
-
-    from requests_futures.sessions import FuturesSession
-    from concurrent.futures import ProcessPoolExecutor
-
-    #Get nextprot ids list
-    ids = id_list_from_nextprot_ftp("pub/current_release/ac_lists/nextprot_ac_list_all.txt")
+    nextprot_ids_file = "nextprot_ac_list_all.txt"
+    ids = id_list_from_nextprot_ftp(nextprot_ids_file,target_directory)
 
     output_file = 'nextprot_ref_'+ time.strftime("%d-%m-%Y") + ".tsv"
     path = os.path.join(target_directory,output_file)
     name = "neXtProt release "+time.strftime("%d-%m-%Y")
     release_id = "nextprot_ref_"+time.strftime("%d-%m-%Y")
-
-    #open output file to write
+
     output = open(path, 'w')
     writer = csv.writer(output,delimiter="\t")
-    writer.writerow(["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"])
-
-    subset=100
-    ids_subsets = [ids[x:x+subset] for x in range(0, len(ids), subset)]
-
-    for ids_subset in ids_subsets:
-
-        #Open concurent sessions
-        with FuturesSession(executor=ProcessPoolExecutor(max_workers=8)) as session:
-            futures = [session.get("https://api.nextprot.org/entry/"+id+".json") for id in ids_subset]
-
-            for id,future in zip(ids_subset,futures) :
+
+    nextprot_file=[["NextprotID","MW","SeqLength","IsoPoint","Chr","SubcellLocations","Diseases","TMDomains","ProteinExistence"]]
+    writer.writerows(nextprot_file)
+
+    for id in ids :
+        query="https://api.nextprot.org/entry/"+id+".json"
+        try:
+            resp = requests.get(url=query)
+        except :
+            print ("wainting 1 hour before trying again")
+            time.sleep(3600)
+            resp = requests.get(url=query)
+        data = resp.json()
 
-                #Get json dictionary
-                try:
-                    res = future.result()
-                except:
-                    print ("sleep 1 hour")
-                    time.sleep(3600)
-                    res = future.result()
-                data = res.json()
-
-                #get info from json dictionary
-                mass_mol = data["entry"]["isoforms"][0]["massAsString"]
-                seq_length = data['entry']["isoforms"][0]["sequenceLength"]
-                iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"]
-                chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"]
-                protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level'])
+        #get info from json dictionary
+        mass_mol = data["entry"]["isoforms"][0]["massAsString"]
+        seq_length = data['entry']["isoforms"][0]["sequenceLength"]
+        iso_elec_point = data['entry']["isoforms"][0]["isoelectricPointAsString"]
+        chr_loc = data['entry']["chromosomalLocations"][0]["chromosome"]
+        protein_existence = "PE"+str(data['entry']["overview"]['proteinExistence']['level'])
 
-                #put all subcell loc in a set
-                if "subcellular-location" in data['entry']["annotationsByCategory"].keys() :
-                    subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"]
-                    all_subcell_locs = set()
-                    for loc in subcell_locs :
-                        all_subcell_locs.add(loc['cvTermName'])
-                    all_subcell_locs.discard("")
-                    all_subcell_locs = ";".join(all_subcell_locs)
-                else :
-                    all_subcell_locs = "NA"
-
-                #put all subcell loc in a set
-                if ('disease') in data['entry']['annotationsByCategory'].keys() :
-                    diseases = data['entry']['annotationsByCategory']['disease']
-                    all_diseases = set()
-                    for disease in diseases :
-                        if (disease['cvTermName'] is not None and disease['cvTermName'] != ""):
-                            all_diseases.add(disease['cvTermName'])
-                    if len(all_diseases) > 0 : all_diseases = ";".join(all_diseases)
-                    else : all_diseases="NA"
-                else :
-                    all_diseases="NA"
+        #put all subcell loc in a set
+        if "subcellular-location" in data['entry']["annotationsByCategory"].keys() :
+            subcell_locs = data['entry']["annotationsByCategory"]["subcellular-location"]
+            all_subcell_locs = set()
+            for loc in subcell_locs :
+                all_subcell_locs.add(loc['cvTermName'])
+            all_subcell_locs.discard("")
+            all_subcell_locs = ";".join(all_subcell_locs)
+        else :
+            all_subcell_locs = "NA"
+
+        #put all subcell loc in a set
+        if ('disease') in data['entry']['annotationsByCategory'].keys() :
+            diseases = data['entry']['annotationsByCategory']['disease']
+            all_diseases = set()
+            for disease in diseases :
+                if (disease['cvTermName'] is not None and disease['cvTermName'] != ""):
+                    all_diseases.add(disease['cvTermName'])
+            if len(all_diseases) > 0 : all_diseases = ";".join(all_diseases)
+            else : all_diseases="NA"
+        else :
+            all_diseases="NA"
 
-                #get all tm domain
-                nb_domains = 0
-                if "transmembrane-region" in data['entry']['annotationsByCategory'].keys():
-                    tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"]
-                    all_tm_domains = set()
-                    for tm in tm_domains :
-                        all_tm_domains.add(tm['cvTermName'])
-                        nb_domains+=1
+        #get all tm domain
+        nb_domains = 0
+        if "transmembrane-region" in data['entry']['annotationsByCategory'].keys():
+            tm_domains = data['entry']['annotationsByCategory']["transmembrane-region"]
+            all_tm_domains = set()
+            for tm in tm_domains :
+                all_tm_domains.add(tm['cvTermName'])
+                nb_domains+=1
+                #print "nb domains ++"
+                #print (nb_domains)
+        nextprot_file[:] = []
+        nextprot_file.append([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])
+        writer.writerows(nextprot_file)
 
-                writer.writerow([id,mass_mol,str(seq_length),iso_elec_point,chr_loc,all_subcell_locs,all_diseases,str(nb_domains),protein_existence])
-
-    id = str(10000000000 - int(time.strftime("%Y%m%d")))
+    id = str(10000000000 - int(time.strftime("%Y%m%d")))
     data_table_entry = dict(id=id, release=release_id, name = name, value = path)
     _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_nextprot_ref")

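The other substantive change in `Build_nextprot_ref_file` drops the `requests_futures` concurrent fetching in favour of sequential `requests.get` calls that retry once after a one-hour sleep, writing one TSV row per accession. A hedged sketch of that retry pattern is below; `get_entry_json` and the `wait_seconds` parameter are illustrative assumptions, only the URL scheme is taken from the diff.

```python
import time
import requests

def get_entry_json(accession, wait_seconds=3600):
    # Hypothetical helper: fetch one neXtProt entry as JSON, retrying once
    # after a long back-off, as the new per-id loop in the diff does.
    url = "https://api.nextprot.org/entry/" + accession + ".json"
    try:
        resp = requests.get(url=url)
    except requests.RequestException:
        print("waiting %d seconds before trying again" % wait_seconds)
        time.sleep(wait_seconds)
        resp = requests.get(url=url)
    return resp.json()

# Usage sketch: data = get_entry_json("NX_P01308") gives the dictionary the
# loop then reads fields such as data["entry"]["isoforms"][0]["massAsString"] from.
```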