proteore_data_manager: data_manager/resource

comparison data_manager/resource_building.py @ 20:29cf75c83618 draft

planemo upload commit e05ccbf13c33b97ab441e2f8bc4b5bc746a378df-dirty

author	proteore
date	Tue, 07 May 2019 08:37:17 -0400
parents	f75c525e0a4a
children	026177e4ff4b

comparison

Legend: equal | deleted (-) | inserted (+) | replaced

-:ff724e70dae0
+:29cf75c83618
 human = species == "Human"
 species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" }
 files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
 #header
-if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
+if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
-else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
+else : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
-#print("header ok")
 #get selected.tab and keep only ids of interest
 selected_tab_file=species_dict[species]+"_"+files[0]
 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory)
 with gzip.open(tab_path,"rt") as select :
 for line in tab_reader :
 tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
 os.remove(tab_path)
 #print("selected_tab ok")
+#get uniprot-AC reviewed
+query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+species_dict[species]+"&format=list"
+with requests.Session() as s:
+download = s.get(query)
+decoded_content = download.content.decode('utf-8')
+uniprot_reviewed_list = decoded_content.splitlines()
+for line in tab[1:]:
+UniProtAC = line[0]
+if UniProtAC in uniprot_reviewed_list :
+line.insert(1,UniProtAC)
+else :
+line.insert(1,"")
 """
 Supplementary ID to get from HUMAN_9606_idmapping.dat :
 -NextProt,BioGrid,STRING,KEGG
 """
 else :
 line.extend(["","",""])
 #print ("tab ok")
-#add missing nextprot ID for human
+#add missing nextprot ID for human or replace old ones
 if human :
 #build next_dict
 nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
 next_dict = {}
 for nextid in nextprot_ids :
 #add missing nextprot ID
 for line in tab[1:] :
 uniprotID=line[0]
 nextprotID=line[13]
-if nextprotID == '' and uniprotID in next_dict :
+if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) :
 line[13]=next_dict[uniprotID]
 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"
 path = os.path.join(target_directory,output_file)
 w = csv.writer(out,delimiter='\t')
 w.writerows(tab)
 name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"}
 name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")"
-id = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
+id = str(10000000000 - int(time.strftime("%d%m%Y")))
 data_table_entry = dict(id=id, name = name, species = species, value = path)
 _add_data_table_entry(data_manager_dict, data_table_entry, "proteore_id_mapping_"+species)
 def download_from_uniprot_ftp(file,target_directory) :

Mercurial > repos > proteore > proteore_data_manager

comparison data_manager/resource_building.py @ 20:29cf75c83618 draft