changeset 20:29cf75c83618 draft

planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty
author proteore
date Tue, 07 May 2019 08:37:17 -0400
parents ff724e70dae0
children 026177e4ff4b
files data_manager/resource_building.py data_manager/resource_building.xml
diffstat 2 files changed, 21 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/data_manager/resource_building.py	Mon May 06 05:01:21 2019 -0400
+++ b/data_manager/resource_building.py	Tue May 07 08:37:17 2019 -0400
@@ -137,10 +137,8 @@
     files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
 
     #header
-    if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
-    else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
-
-    #print("header ok")
+    if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
+    else : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
 
     #get selected.tab and keep only ids of interest
     selected_tab_file=species_dict[species]+"_"+files[0]
@@ -153,6 +151,21 @@
 
     #print("selected_tab ok")
 
+    #get the list of reviewed UniProt-AC for the selected species
+    query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+species_dict[species]+"&format=list"
+
+    with requests.Session() as s:
+        download = s.get(query)
+        decoded_content = download.content.decode('utf-8')
+        uniprot_reviewed_list = decoded_content.splitlines()
+
+    for line in tab[1:]:
+        UniProtAC = line[0]
+        if UniProtAC in uniprot_reviewed_list :
+            line.insert(1,UniProtAC)
+        else : 
+            line.insert(1,"")
+
     """
     Supplementary ID to get from HUMAN_9606_idmapping.dat :
     -NextProt,BioGrid,STRING,KEGG
@@ -204,7 +217,7 @@
 
     #print ("tab ok")
 
-    #add missing nextprot ID for human
+    #add missing nextprot ID for human or replace old ones
     if human : 
         #build next_dict
         nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
@@ -217,7 +230,7 @@
         for line in tab[1:] : 
             uniprotID=line[0]
             nextprotID=line[13]
-            if nextprotID == '' and uniprotID in next_dict :
+            if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) :
                 line[13]=next_dict[uniprotID]
 
     output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"
@@ -229,7 +242,7 @@
 
     name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"}
     name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")"
-    id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
+    id = str(10000000000 - int(time.strftime("%d%m%Y")))
 
     data_table_entry = dict(id=id, name = name, species = species, value = path)
     _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_"+species)
--- a/data_manager/resource_building.xml	Mon May 06 05:01:21 2019 -0400
+++ b/data_manager/resource_building.xml	Tue May 07 08:37:17 2019 -0400
@@ -1,4 +1,4 @@
-<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2019.05.05" tool_type="manage_data">
+<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2019.05.07" tool_type="manage_data">
 <description>
 to create or update reference files for proteore tools
 </description>