Mercurial > repos > proteore > proteore_data_manager
comparison data_manager/resource_building.py @ 30:a6cabd3ab71f draft
"planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"
| author | proteore |
|---|---|
| date | Thu, 12 Dec 2019 09:26:42 +0000 |
| parents | 9a40b72414de |
| children | faeeabb11a4d |
Comparison legend:
- equal
- deleted
- inserted
- replaced
| 29:9a40b72414de | 30:a6cabd3ab71f |
|---|---|
| 136 human = species == "Human" | 136 human = species == "Human" |
| 137 species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" } | 137 species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" } |
| 138 files=["idmapping_selected.tab.gz","idmapping.dat.gz"] | 138 files=["idmapping_selected.tab.gz","idmapping.dat.gz"] |
| 139 | 139 |
| 140 #header | 140 #header |
| 141 if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]] | 141 if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG",'Gene_Name']] |
| 142 else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]] | 142 else : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG",'Gene_Name']] |
| 143 | 143 |
| 144 #get selected.tab and keep only ids of interest | 144 #get selected.tab and keep only ids of interest |
| 145 selected_tab_file=species_dict[species]+"_"+files[0] | 145 selected_tab_file=species_dict[species]+"_"+files[0] |
| 146 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) | 146 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) |
| 147 with gzip.open(tab_path,"rt") as select : | 147 with gzip.open(tab_path,"rt") as select : |
| 148 tab_reader = csv.reader(select,delimiter="\t") | 148 tab_reader = csv.reader(select,delimiter="\t") |
| 149 for line in tab_reader : | 149 for line in tab_reader : |
| 150 tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]]) | 150 tab.append([line[0]]+[line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]]) |
| 151 os.remove(tab_path) | 151 os.remove(tab_path) |
| 152 | 152 |
| 153 #print("selected_tab ok") | 153 #print("selected_tab ok") |
| 154 | 154 |
| 155 #get uniprot-AC reviewed | 155 #get uniprot-AC reviewed |
| 160 download = s.get(query) | 160 download = s.get(query) |
| 161 decoded_content = download.content.decode('utf-8') | 161 decoded_content = download.content.decode('utf-8') |
| 162 uniprot_reviewed_list = decoded_content.splitlines() | 162 uniprot_reviewed_list = decoded_content.splitlines() |
| 163 | 163 |
| 164 for line in tab[1:]: | 164 for line in tab[1:]: |
| 165 UniProtAC = line[0] | 165 UniProtAC = line[1] |
| 166 if UniProtAC not in uniprot_reviewed_list : | 166 if UniProtAC not in uniprot_reviewed_list : |
| 167 line[0]="" | |
| 168 line[1]="" | 167 line[1]="" |
| 168 line[2]="" | |
| 169 | 169 |
| 170 """ | 170 """ |
| 171 Supplementary ID to get from HUMAN_9606_idmapping.dat : | 171 Supplementary ID to get from HUMAN_9606_idmapping.dat : |
| 172 -NextProt,BioGrid,STRING,KEGG | 172 -NextProt,BioGrid,STRING,KEGG |
| 173 """ | 173 """ |
| 174 | 174 |
| 175 #there's more id type for human | 175 #there's more id type for human |
| 176 if human : ids = ['neXtProt','BioGrid','STRING','KEGG' ] #ids to get from dat_file | 176 if human : ids = ['neXtProt','BioGrid','STRING','KEGG','Gene_Name' ] #ids to get from dat_file |
| 177 else : ids = ['BioGrid','STRING','KEGG' ] | 177 else : ids = ['BioGrid','STRING','KEGG','Gene_Name' ] |
| 178 unidict = {} | 178 unidict = {} |
| 179 | 179 |
| 180 #keep only ids of interest in dictionaries | 180 #keep only ids of interest in dictionaries |
| 181 dat_file=species_dict[species]+"_"+files[1] | 181 dat_file=species_dict[species]+"_"+files[1] |
| 182 dat_path = download_from_uniprot_ftp(dat_file,target_directory) | 182 dat_path = download_from_uniprot_ftp(dat_file,target_directory) |
| 204 if human : | 204 if human : |
| 205 if uniprotID in unidict : | 205 if uniprotID in unidict : |
| 206 nextprot = access_dictionary(unidict,uniprotID,'neXtProt') | 206 nextprot = access_dictionary(unidict,uniprotID,'neXtProt') |
| 207 if nextprot != '' : nextprot = clean_nextprot_id(nextprot,line[0]) | 207 if nextprot != '' : nextprot = clean_nextprot_id(nextprot,line[0]) |
| 208 line.extend([nextprot,access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'), | 208 line.extend([nextprot,access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'), |
| 209 access_dictionary(unidict,uniprotID,'KEGG')]) | 209 access_dictionary(unidict,uniprotID,'KEGG'),access_dictionary(unidict,uniprotID,'Gene_Name')]) |
| 210 else : | 210 else : |
| 211 line.extend(["","","",""]) | 211 line.extend(["","","","",""]) |
| 212 else : | 212 else : |
| 213 if uniprotID in unidict : | 213 if uniprotID in unidict : |
| 214 line.extend([access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'), | 214 line.extend([access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'), |
| 215 access_dictionary(unidict,uniprotID,'KEGG')]) | 215 access_dictionary(unidict,uniprotID,'KEGG'),access_dictionary(unidict,uniprotID,'Gene_Name')]) |
| 216 else : | 216 else : |
| 217 line.extend(["","",""]) | 217 line.extend(["","","",""]) |
| 218 | 218 |
| 219 #print ("tab ok") | 219 #print ("tab ok") |
| 220 | 220 |
| 221 #add missing nextprot ID for human or replace old ones | 221 #add missing nextprot ID for human or replace old ones |
| 222 if human : | 222 if human : |
| 228 os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt")) | 228 os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt")) |
| 229 | 229 |
| 230 #add missing nextprot ID | 230 #add missing nextprot ID |
| 231 for line in tab[1:] : | 231 for line in tab[1:] : |
| 232 uniprotID=line[0] | 232 uniprotID=line[0] |
| 233 nextprotID=line[13] | 233 nextprotID=line[14] |
| 234 if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) : | 234 if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) : |
| 235 line[13]=next_dict[uniprotID] | 235 line[14]=next_dict[uniprotID] |
| 236 | 236 |
| 237 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv" | 237 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv" |
| 238 path = os.path.join(target_directory,output_file) | 238 path = os.path.join(target_directory,output_file) |
| 239 | 239 |
| 240 with open(path,"w") as out : | 240 with open(path,"w") as out : |
