comparison data_manager/resource_building.py @ 20:29cf75c83618 draft

planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
author proteore
date Tue, 07 May 2019 08:37:17 -0400
parents f75c525e0a4a
children 026177e4ff4b
comparison
equal deleted inserted replaced
19:ff724e70dae0 20:29cf75c83618
135 human = species == "Human" 135 human = species == "Human"
136 species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" } 136 species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" }
137 files=["idmapping_selected.tab.gz","idmapping.dat.gz"] 137 files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
138 138
139 #header 139 #header
140 if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]] 140 if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
141 else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]] 141 else : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
142
143 #print("header ok")
144 142
145 #get selected.tab and keep only ids of interest 143 #get selected.tab and keep only ids of interest
146 selected_tab_file=species_dict[species]+"_"+files[0] 144 selected_tab_file=species_dict[species]+"_"+files[0]
147 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) 145 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory)
148 with gzip.open(tab_path,"rt") as select : 146 with gzip.open(tab_path,"rt") as select :
150 for line in tab_reader : 148 for line in tab_reader :
151 tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]]) 149 tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
152 os.remove(tab_path) 150 os.remove(tab_path)
153 151
154 #print("selected_tab ok") 152 #print("selected_tab ok")
153
154 #get uniprot-AC reviewed
155 query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+species_dict[species]+"&format=list"
156
157 with requests.Session() as s:
158 download = s.get(query)
159 decoded_content = download.content.decode('utf-8')
160 uniprot_reviewed_list = decoded_content.splitlines()
161
162 for line in tab[1:]:
163 UniProtAC = line[0]
164 if UniProtAC in uniprot_reviewed_list :
165 line.insert(1,UniProtAC)
166 else :
167 line.insert(1,"")
155 168
156 """ 169 """
157 Supplementary ID to get from HUMAN_9606_idmapping.dat : 170 Supplementary ID to get from HUMAN_9606_idmapping.dat :
158 -NextProt,BioGrid,STRING,KEGG 171 -NextProt,BioGrid,STRING,KEGG
159 """ 172 """
202 else : 215 else :
203 line.extend(["","",""]) 216 line.extend(["","",""])
204 217
205 #print ("tab ok") 218 #print ("tab ok")
206 219
207 #add missing nextprot ID for human 220 #add missing nextprot ID for human or replace old ones
208 if human : 221 if human :
209 #build next_dict 222 #build next_dict
210 nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory) 223 nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
211 next_dict = {} 224 next_dict = {}
212 for nextid in nextprot_ids : 225 for nextid in nextprot_ids :
215 228
216 #add missing nextprot ID 229 #add missing nextprot ID
217 for line in tab[1:] : 230 for line in tab[1:] :
218 uniprotID=line[0] 231 uniprotID=line[0]
219 nextprotID=line[13] 232 nextprotID=line[13]
220 if nextprotID == '' and uniprotID in next_dict : 233 if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) :
221 line[13]=next_dict[uniprotID] 234 line[13]=next_dict[uniprotID]
222 235
223 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv" 236 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"
224 path = os.path.join(target_directory,output_file) 237 path = os.path.join(target_directory,output_file)
225 238
227 w = csv.writer(out,delimiter='\t') 240 w = csv.writer(out,delimiter='\t')
228 w.writerows(tab) 241 w.writerows(tab)
229 242
230 name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"} 243 name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"}
231 name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")" 244 name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")"
232 id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") 245 id = str(10000000000 - int(time.strftime("%d%m%Y")))
233 246
234 data_table_entry = dict(id=id, name = name, species = species, value = path) 247 data_table_entry = dict(id=id, name = name, species = species, value = path)
235 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_"+species) 248 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_"+species)
236 249
237 def download_from_uniprot_ftp(file,target_directory) : 250 def download_from_uniprot_ftp(file,target_directory) :