Mercurial > repos > proteore > proteore_data_manager
comparison data_manager/resource_building.py @ 20:29cf75c83618 draft
planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
| author | proteore |
|---|---|
| date | Tue, 07 May 2019 08:37:17 -0400 |
| parents | f75c525e0a4a |
| children | 026177e4ff4b |
Comparison legend (how each row of the table below is classified):
- equal
- deleted
- inserted
- replaced
| 19:ff724e70dae0 | 20:29cf75c83618 |
|---|---|
| 135 human = species == "Human" | 135 human = species == "Human" |
| 136 species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" } | 136 species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" } |
| 137 files=["idmapping_selected.tab.gz","idmapping.dat.gz"] | 137 files=["idmapping_selected.tab.gz","idmapping.dat.gz"] |
| 138 | 138 |
| 139 #header | 139 #header |
| 140 if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]] | 140 if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]] |
| 141 else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]] | 141 else : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]] |
| 142 | |
| 143 #print("header ok") | |
| 144 | 142 |
| 145 #get selected.tab and keep only ids of interest | 143 #get selected.tab and keep only ids of interest |
| 146 selected_tab_file=species_dict[species]+"_"+files[0] | 144 selected_tab_file=species_dict[species]+"_"+files[0] |
| 147 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) | 145 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory) |
| 148 with gzip.open(tab_path,"rt") as select : | 146 with gzip.open(tab_path,"rt") as select : |
| 150 for line in tab_reader : | 148 for line in tab_reader : |
| 151 tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]]) | 149 tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]]) |
| 152 os.remove(tab_path) | 150 os.remove(tab_path) |
| 153 | 151 |
| 154 #print("selected_tab ok") | 152 #print("selected_tab ok") |
| 153 | |
| 154 #get uniprot-AC reviewed | |
| 155 query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+species_dict[species]+"&format=list" | |
| 156 | |
| 157 with requests.Session() as s: | |
| 158 download = s.get(query) | |
| 159 decoded_content = download.content.decode('utf-8') | |
| 160 uniprot_reviewed_list = decoded_content.splitlines() | |
| 161 | |
| 162 for line in tab[1:]: | |
| 163 UniProtAC = line[0] | |
| 164 if UniProtAC in uniprot_reviewed_list : | |
| 165 line.insert(1,UniProtAC) | |
| 166 else : | |
| 167 line.insert(1,"") | |
| 155 | 168 |
| 156 """ | 169 """ |
| 157 Supplementary ID to get from HUMAN_9606_idmapping.dat : | 170 Supplementary ID to get from HUMAN_9606_idmapping.dat : |
| 158 -NextProt,BioGrid,STRING,KEGG | 171 -NextProt,BioGrid,STRING,KEGG |
| 159 """ | 172 """ |
| 202 else : | 215 else : |
| 203 line.extend(["","",""]) | 216 line.extend(["","",""]) |
| 204 | 217 |
| 205 #print ("tab ok") | 218 #print ("tab ok") |
| 206 | 219 |
| 207 #add missing nextprot ID for human | 220 #add missing nextprot ID for human or replace old ones |
| 208 if human : | 221 if human : |
| 209 #build next_dict | 222 #build next_dict |
| 210 nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory) | 223 nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory) |
| 211 next_dict = {} | 224 next_dict = {} |
| 212 for nextid in nextprot_ids : | 225 for nextid in nextprot_ids : |
| 215 | 228 |
| 216 #add missing nextprot ID | 229 #add missing nextprot ID |
| 217 for line in tab[1:] : | 230 for line in tab[1:] : |
| 218 uniprotID=line[0] | 231 uniprotID=line[0] |
| 219 nextprotID=line[13] | 232 nextprotID=line[13] |
| 220 if nextprotID == '' and uniprotID in next_dict : | 233 if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) : |
| 221 line[13]=next_dict[uniprotID] | 234 line[13]=next_dict[uniprotID] |
| 222 | 235 |
| 223 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv" | 236 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv" |
| 224 path = os.path.join(target_directory,output_file) | 237 path = os.path.join(target_directory,output_file) |
| 225 | 238 |
| 227 w = csv.writer(out,delimiter='\t') | 240 w = csv.writer(out,delimiter='\t') |
| 228 w.writerows(tab) | 241 w.writerows(tab) |
| 229 | 242 |
| 230 name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"} | 243 name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"} |
| 231 name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")" | 244 name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")" |
| 232 id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") | 245 id = str(10000000000 - int(time.strftime("%d%m%Y"))) |
| 233 | 246 |
| 234 data_table_entry = dict(id=id, name = name, species = species, value = path) | 247 data_table_entry = dict(id=id, name = name, species = species, value = path) |
| 235 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_"+species) | 248 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_"+species) |
| 236 | 249 |
| 237 def download_from_uniprot_ftp(file,target_directory) : | 250 def download_from_uniprot_ftp(file,target_directory) : |
