Mercurial > repos > proteore > proteore_data_manager
changeset 20:29cf75c83618 draft
planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
| author | proteore |
|---|---|
| date | Tue, 07 May 2019 08:37:17 -0400 |
| parents | ff724e70dae0 |
| children | 026177e4ff4b |
| files | data_manager/resource_building.py data_manager/resource_building.xml |
| diffstat | 2 files changed, 21 insertions(+), 8 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/resource_building.py Mon May 06 05:01:21 2019 -0400 +++ b/data_manager/resource_building.py Tue May 07 08:37:17 2019 -0400 @@ -137,10 +137,8 @@ files=["idmapping_selected.tab.gz","idmapping.dat.gz"] #header - if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]] - else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]] - - #print("header ok") + if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]] + else : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]] #get selected.tab and keep only ids of interest selected_tab_file=species_dict[species]+"_"+files[0] @@ -153,6 +151,21 @@ #print("selected_tab ok") + #get uniprot-AC reviewed + query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+species_dict[species]+"&format=list" + + with requests.Session() as s: + download = s.get(query) + decoded_content = download.content.decode('utf-8') + uniprot_reviewed_list = decoded_content.splitlines() + + for line in tab[1:]: + UniProtAC = line[0] + if UniProtAC in uniprot_reviewed_list : + line.insert(1,UniProtAC) + else : + line.insert(1,"") + """ Supplementary ID to get from HUMAN_9606_idmapping.dat : -NextProt,BioGrid,STRING,KEGG @@ -204,7 +217,7 @@ #print ("tab ok") - #add missing nextprot ID for human + #add missing nextprot ID for human or replace old ones if human : #build next_dict nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory) @@ -217,7 +230,7 @@ for line in tab[1:] : uniprotID=line[0] nextprotID=line[13] - if nextprotID == '' and uniprotID in next_dict : + if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) : line[13]=next_dict[uniprotID] output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv" @@ -229,7 +242,7 @@ name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"} name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")" - id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + id = str(10000000000 - int(time.strftime("%d%m%Y"))) data_table_entry = dict(id=id, name = name, species = species, value = path) _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_"+species)
--- a/data_manager/resource_building.xml Mon May 06 05:01:21 2019 -0400 +++ b/data_manager/resource_building.xml Tue May 07 08:37:17 2019 -0400 @@ -1,4 +1,4 @@ -<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2019.05.05" tool_type="manage_data"> +<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2019.05.07" tool_type="manage_data"> <description> to create or update reference files for proteore tools </description>
