comparison data_manager/resource_building.py @ 31:faeeabb11a4d draft

"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
author proteore
date Wed, 08 Jan 2020 09:53:34 +0000
parents a6cabd3ab71f
children ec1febc6672e
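
Summary of the change: id_mapping_sources() gains a tool_data_path argument, creates a dated ID_mapping_archive_<species>_<YYYYMMDD> directory there, moves the downloaded UniProt and neXtProt source files into it instead of deleting them, saves the reviewed-accession list alongside, and tars the directory up at the end; --tool_data_path is added to the command line and also passed through to peptide_atlas_sources().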
--- data_manager/resource_building.py  30:a6cabd3ab71f
+++ data_manager/resource_building.py  31:faeeabb11a4d
@@ -1,11 +1,11 @@
 # -*- coding: utf-8 -*-
 """
 The purpose of this script is to create source files from different databases to be used in other proteore tools
 """
 
-import os, sys, argparse, requests, time, csv, re, json, shutil, zipfile
+import os, sys, argparse, requests, subprocess, time, csv, re, json, shutil, zipfile
 from io import BytesIO
 from zipfile import ZipFile
 from galaxy.util.json import from_json_string, to_json_string
 
 #######################################################################################################
@@ -129,15 +129,17 @@
 # 3. ID mapping file
 #######################################################################################################
 import ftplib, gzip
 csv.field_size_limit(sys.maxsize) # to handle big files
 
-def id_mapping_sources (data_manager_dict, species, target_directory) :
+def id_mapping_sources (data_manager_dict, species, target_directory, tool_data_path) :
 
     human = species == "Human"
     species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" }
     files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
+    archive = os.path.join(tool_data_path, "ID_mapping_archive_"+species+"_"+str(time.strftime("%Y%m%d")))
+    os.mkdir(archive)
 
     #header
     if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG",'Gene_Name']]
     else : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG",'Gene_Name']]
 
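
Note on the new os.mkdir call: it raises FileExistsError if the data manager is run twice on the same day for the same species. A more forgiving variant would be (a sketch, not what the patch does):

    import os, time

    archive = os.path.join(tool_data_path,
                           "ID_mapping_archive_%s_%s" % (species, time.strftime("%Y%m%d")))
    os.makedirs(archive, exist_ok=True)  # tolerate a same-day re-run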
@@ -146,12 +148,11 @@
     tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory)
     with gzip.open(tab_path,"rt") as select :
         tab_reader = csv.reader(select,delimiter="\t")
         for line in tab_reader :
             tab.append([line[0]]+[line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
-    os.remove(tab_path)
-
+    shutil.move(tab_path, archive)
     #print("selected_tab ok")
 
     #get uniprot-AC reviewed
     organism = species_dict[species].split("_")[1]
     query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+organism+"&format=list"
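
For readers decoding the index list in the tab.append line: the 0-based columns of UniProt's idmapping_selected.tab are documented in the FTP README, and the selection lines up with the header row built above. A reference table (worth re-checking against the current README):

    # 0-based columns of idmapping_selected.tab (per the UniProt FTP README):
    # 0 UniProtKB-AC   1 UniProtKB-ID   2 GeneID (EntrezGene)   3 RefSeq
    # 4 GI             5 PDB            6 GO                    11 PIR
    # 13 MIM           14 UniGene       18 Ensembl              19 Ensembl_TRS
    # 20 Ensembl_PRO
    # line[0] is emitted twice: once as "UniProt-AC" and once as
    # "UniProt-AC_reviewed", which is blanked later for unreviewed entries.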
@@ -159,15 +160,22 @@
     with requests.Session() as s:
         download = s.get(query)
         decoded_content = download.content.decode('utf-8')
         uniprot_reviewed_list = decoded_content.splitlines()
 
+    #save reviewed list
+    reviewed_list_path = os.path.join(target_directory,'uniprot_reviewed_list.txt')
+    with open(reviewed_list_path,'w') as reviewed_list_file:
+        for id in uniprot_reviewed_list:
+            reviewed_list_file.write(id+"\n")
+    shutil.move(reviewed_list_path, archive)
+
+    #remove unreviewed uniprot-AC
     for line in tab[1:]:
         UniProtAC = line[1]
         if UniProtAC not in uniprot_reviewed_list :
             line[1]=""
-            line[2]=""
 
     """
     Supplementary ID to get from HUMAN_9606_idmapping.dat :
     -NextProt,BioGrid,STRING,KEGG
     """
@@ -192,11 +200,11 @@
                     unidict[uniprotID][id_type]= ";".join([unidict[uniprotID][id_type],cor_id]) #if there is already a value in the dictionary
                 else :
                     unidict[uniprotID].update({ id_type : cor_id })
             elif id_type in ids :
                 unidict[uniprotID]={id_type : cor_id}
-    os.remove(dat_path)
+    shutil.move(dat_path, archive)
 
     #print("dat_file ok")
 
     #add ids from idmapping.dat to the final tab
     for line in tab[1:] :
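
unidict is a dict of dicts mapping a UniProt accession to {id_type: "val1;val2;..."}. The three-branch insert above can be written more compactly with collections.defaultdict; a simplified sketch that applies the ids filter uniformly (the original only filters when creating a new entry):

    from collections import defaultdict

    unidict = defaultdict(dict)  # UniProt-AC -> {id_type: "val1;val2;..."}

    def add_id(uniprot_id, id_type, cor_id, ids):
        if id_type not in ids:
            return
        entry = unidict[uniprot_id]
        entry[id_type] = entry[id_type] + ";" + cor_id if id_type in entry else cor_id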
@@ -219,11 +227,13 @@
     #print ("tab ok")
 
     #add missing nextprot ID for human or replace old ones
     if human :
         #build next_dict
-        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
+        nextprot_path = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
+        with open(nextprot_path,'r') as nextprot_ids :
+            nextprot_ids = nextprot_ids.read().splitlines()
+        shutil.move(nextprot_path,archive)
         next_dict = {}
         for nextid in nextprot_ids :
             next_dict[nextid.replace("NX_","")] = nextid
-        os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt"))
 
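
Stripping the NX_ prefix gives a lookup from bare UniProt accession to neXtProt ID, e.g. next_dict["P01308"] == "NX_P01308"; the tab rows carry plain accessions, which is how the neXtProt column gets refreshed.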
@@ -238,10 +249,12 @@
     path = os.path.join(target_directory,output_file)
 
     with open(path,"w") as out :
         w = csv.writer(out,delimiter='\t')
         w.writerows(tab)
+
+    subprocess.call(['tar', '-zcf', archive+".tar.gz", archive])
 
     name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"}
     name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")"
     release = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
     id = str(10000000000 - int(time.strftime("%Y%m%d"))) #new ids must be smaller than previous ones -> the <filter> sort in the xml is descending only
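
The tar step shells out to the external tar binary. A pure-Python equivalent with the tarfile module (a sketch; arcname keeps the absolute tool_data_path prefix out of the tarball):

    import os, tarfile

    def tar_archive(archive_dir):
        with tarfile.open(archive_dir + ".tar.gz", "w:gz") as tar:
            tar.add(archive_dir, arcname=os.path.basename(archive_dir))

On the id line: for this commit's date, 10000000000 - 20200108 = 9979799892, and any later run yields a strictly smaller id, so the descending-only <filter> sort in the data table XML lists the newest release first.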
@@ -265,13 +278,12 @@
     ftp = ftplib.FTP("ftp.nextprot.org")
     ftp.login("anonymous", "anonymous")
     ftp.cwd(ftp_dir)
     ftp.retrbinary("RETR " + file, open(path, 'wb').write)
     ftp.quit()
-    with open(path,'r') as nextprot_ids :
-        nextprot_ids = nextprot_ids.read().splitlines()
-    return (nextprot_ids)
+
+    return (path)
 
 #return '' if there's no value in a dictionary, avoid error
 def access_dictionary (dico,key1,key2) :
     if key1 in dico :
         if key2 in dico[key1] :
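
retrbinary writes through an anonymous open(path, 'wb').write, leaving the file handle for the garbage collector to close. A sketch of the same download with explicit cleanup on Python 3 (fetch_from_nextprot_ftp is a hypothetical name; ftp_dir is whatever the enclosing function defines, which this hunk does not show):

    import ftplib, os

    def fetch_from_nextprot_ftp(file, ftp_dir, target_directory):
        path = os.path.join(target_directory, file)
        with ftplib.FTP("ftp.nextprot.org") as ftp:  # quits the session on exit
            ftp.login("anonymous", "anonymous")
            ftp.cwd(ftp_dir)
            with open(path, "wb") as fh:
                ftp.retrbinary("RETR " + file, fh.write)
        return path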
@@ -595,10 +607,11 @@
     parser.add_argument("--interactome", metavar = ("PPI"))
     parser.add_argument("--species")
     parser.add_argument("--date")
     parser.add_argument("-o", "--output")
     parser.add_argument("--database")
+    parser.add_argument("--tool_data_path")
     args = parser.parse_args()
 
     data_manager_dict = {}
     # Extract json file params
     filename = args.output
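
With the new argument, a data-manager run passes the Galaxy tool-data directory through along these lines (flag values are illustrative, not taken from the patch):

    python resource_building.py --id_mapping Human -o output.json --tool_data_path /galaxy/tool-data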
@@ -625,11 +638,11 @@
         peptide_atlas = None
     if peptide_atlas is not None:
         #target_directory = "/projet/galaxydev/galaxy/tools/proteore/ProteoRE/tools/resources_building/test-data/"
         peptide_atlas = peptide_atlas.split(",")
         for pa_tissue in peptide_atlas:
-            peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory)
+            peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory, args.tool_data_path)
 
     ## Download ID_mapping source file from Uniprot
     try:
         id_mapping=args.id_mapping
     except NameError:
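
One pre-existing quirk this hunk leaves in place: argparse sets unsupplied optional arguments to None rather than leaving the name undefined, so the except NameError around args.id_mapping can never fire. A plain truthiness test would express the intent:

    id_mapping = args.id_mapping  # None when --id_mapping was not supplied
    if id_mapping:
        ...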