comparison data_manager/resource_building.py @ 30:a6cabd3ab71f draft

"planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
author proteore
date Thu, 12 Dec 2019 09:26:42 +0000
parents 9a40b72414de
children faeeabb11a4d
comparison
equal deleted inserted replaced
29:9a40b72414de 30:a6cabd3ab71f
136 human = species == "Human" 136 human = species == "Human"
137 species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" } 137 species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" }
138 files=["idmapping_selected.tab.gz","idmapping.dat.gz"] 138 files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
139 139
140 #header 140 #header
141 if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]] 141 if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG",'Gene_Name']]
142 else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]] 142 else : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG",'Gene_Name']]
143 143
144 #get selected.tab and keep only ids of interest 144 #get selected.tab and keep only ids of interest
145 selected_tab_file=species_dict[species]+"_"+files[0] 145 selected_tab_file=species_dict[species]+"_"+files[0]
146 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) 146 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory)
147 with gzip.open(tab_path,"rt") as select : 147 with gzip.open(tab_path,"rt") as select :
148 tab_reader = csv.reader(select,delimiter="\t") 148 tab_reader = csv.reader(select,delimiter="\t")
149 for line in tab_reader : 149 for line in tab_reader :
150 tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]]) 150 tab.append([line[0]]+[line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
151 os.remove(tab_path) 151 os.remove(tab_path)
152 152
153 #print("selected_tab ok") 153 #print("selected_tab ok")
154 154
155 #get uniprot-AC reviewed 155 #get uniprot-AC reviewed
160 download = s.get(query) 160 download = s.get(query)
161 decoded_content = download.content.decode('utf-8') 161 decoded_content = download.content.decode('utf-8')
162 uniprot_reviewed_list = decoded_content.splitlines() 162 uniprot_reviewed_list = decoded_content.splitlines()
163 163
164 for line in tab[1:]: 164 for line in tab[1:]:
165 UniProtAC = line[0] 165 UniProtAC = line[1]
166 if UniProtAC not in uniprot_reviewed_list : 166 if UniProtAC not in uniprot_reviewed_list :
167 line[0]=""
168 line[1]="" 167 line[1]=""
168 line[2]=""
169 169
170 """ 170 """
171 Supplementary ID to get from HUMAN_9606_idmapping.dat : 171 Supplementary ID to get from HUMAN_9606_idmapping.dat :
172 -NextProt,BioGrid,STRING,KEGG 172 -NextProt,BioGrid,STRING,KEGG
173 """ 173 """
174 174
175 #there's more id type for human 175 #there's more id type for human
176 if human : ids = ['neXtProt','BioGrid','STRING','KEGG' ] #ids to get from dat_file 176 if human : ids = ['neXtProt','BioGrid','STRING','KEGG','Gene_Name' ] #ids to get from dat_file
177 else : ids = ['BioGrid','STRING','KEGG' ] 177 else : ids = ['BioGrid','STRING','KEGG','Gene_Name' ]
178 unidict = {} 178 unidict = {}
179 179
180 #keep only ids of interest in dictionaries 180 #keep only ids of interest in dictionaries
181 dat_file=species_dict[species]+"_"+files[1] 181 dat_file=species_dict[species]+"_"+files[1]
182 dat_path = download_from_uniprot_ftp(dat_file,target_directory) 182 dat_path = download_from_uniprot_ftp(dat_file,target_directory)
204 if human : 204 if human :
205 if uniprotID in unidict : 205 if uniprotID in unidict :
206 nextprot = access_dictionary(unidict,uniprotID,'neXtProt') 206 nextprot = access_dictionary(unidict,uniprotID,'neXtProt')
207 if nextprot != '' : nextprot = clean_nextprot_id(nextprot,line[0]) 207 if nextprot != '' : nextprot = clean_nextprot_id(nextprot,line[0])
208 line.extend([nextprot,access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'), 208 line.extend([nextprot,access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'),
209 access_dictionary(unidict,uniprotID,'KEGG')]) 209 access_dictionary(unidict,uniprotID,'KEGG'),access_dictionary(unidict,uniprotID,'Gene_Name')])
210 else : 210 else :
211 line.extend(["","","",""]) 211 line.extend(["","","","",""])
212 else : 212 else :
213 if uniprotID in unidict : 213 if uniprotID in unidict :
214 line.extend([access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'), 214 line.extend([access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'),
215 access_dictionary(unidict,uniprotID,'KEGG')]) 215 access_dictionary(unidict,uniprotID,'KEGG'),access_dictionary(unidict,uniprotID,'Gene_Name')])
216 else : 216 else :
217 line.extend(["","",""]) 217 line.extend(["","","",""])
218 218
219 #print ("tab ok") 219 #print ("tab ok")
220 220
221 #add missing nextprot ID for human or replace old ones 221 #add missing nextprot ID for human or replace old ones
222 if human : 222 if human :
228 os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt")) 228 os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt"))
229 229
230 #add missing nextprot ID 230 #add missing nextprot ID
231 for line in tab[1:] : 231 for line in tab[1:] :
232 uniprotID=line[0] 232 uniprotID=line[0]
233 nextprotID=line[13] 233 nextprotID=line[14]
234 if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) : 234 if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) :
235 line[13]=next_dict[uniprotID] 235 line[14]=next_dict[uniprotID]
236 236
237 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv" 237 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"
238 path = os.path.join(target_directory,output_file) 238 path = os.path.join(target_directory,output_file)
239 239
240 with open(path,"w") as out : 240 with open(path,"w") as out :