proteore_data_manager: data_manager/resource_building.py

comparison data_manager/resource_building.py @ 30:a6cabd3ab71f draft

"planemo upload commit b89f1921a1759139b452c6fac1ad7ee01b6b633d-dirty"

author	proteore
date	Thu, 12 Dec 2019 09:26:42 +0000
parents	9a40b72414de
children	faeeabb11a4d

comparison

equal deleted inserted replaced

-:9a40b72414de
+:a6cabd3ab71f
 human = species == "Human"
 species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" }
 files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
 #header
-if human : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG"]]
+if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG",'Gene_Name']]
-else : tab = [["UniProt-AC","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG"]]
+else : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG",'Gene_Name']]
 #get selected.tab and keep only ids of interest
 selected_tab_file=species_dict[species]+"_"+files[0]
 tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory)
 with gzip.open(tab_path,"rt") as select :
 tab_reader = csv.reader(select,delimiter="\t")
 for line in tab_reader :
-tab.append([line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
+tab.append([line[0]]+[line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
 os.remove(tab_path)
 #print("selected_tab ok")
 #get uniprot-AC reviewed
 download = s.get(query)
 decoded_content = download.content.decode('utf-8')
 uniprot_reviewed_list = decoded_content.splitlines()
 for line in tab[1:]:
-UniProtAC = line[0]
+UniProtAC = line[1]
 if UniProtAC not in uniprot_reviewed_list :
-line[0]=""
 line[1]=""
+line[2]=""
 """
 Supplementary ID to get from HUMAN_9606_idmapping.dat :
 -neXtProt,BioGrid,STRING,KEGG,Gene_Name
 """
 #there are more ID types for human
-if human : ids = ['neXtProt','BioGrid','STRING','KEGG' ]   #ids to get from dat_file
+if human : ids = ['neXtProt','BioGrid','STRING','KEGG','Gene_Name' ]   #ids to get from dat_file
-else : ids = ['BioGrid','STRING','KEGG' ]
+else : ids = ['BioGrid','STRING','KEGG','Gene_Name' ]
 unidict = {}
 #keep only ids of interest in dictionaries
 dat_file=species_dict[species]+"_"+files[1]
 dat_path = download_from_uniprot_ftp(dat_file,target_directory)
 if human :
 if uniprotID in unidict :
 nextprot = access_dictionary(unidict,uniprotID,'neXtProt')
 if nextprot != '' : nextprot = clean_nextprot_id(nextprot,line[0])
 line.extend([nextprot,access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'),
-access_dictionary(unidict,uniprotID,'KEGG')])
+access_dictionary(unidict,uniprotID,'KEGG'),access_dictionary(unidict,uniprotID,'Gene_Name')])
 else :
-line.extend(["","","",""])
+line.extend(["","","","",""])
 else :
 if uniprotID in unidict :
 line.extend([access_dictionary(unidict,uniprotID,'BioGrid'),access_dictionary(unidict,uniprotID,'STRING'),
-access_dictionary(unidict,uniprotID,'KEGG')])
+access_dictionary(unidict,uniprotID,'KEGG'),access_dictionary(unidict,uniprotID,'Gene_Name')])
 else :
-line.extend(["","",""])
+line.extend(["","","",""])
 #print ("tab ok")
 #add missing nextprot ID for human or replace old ones
 if human :
 os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt"))
 #add missing nextprot ID
 for line in tab[1:] :
 uniprotID=line[0]
-nextprotID=line[13]
+nextprotID=line[14]
 if uniprotID in next_dict and (nextprotID == '' or (nextprotID != "NX_"+uniprotID and next_dict[uniprotID] == "NX_"+uniprotID)) :
-line[13]=next_dict[uniprotID]
+line[14]=next_dict[uniprotID]
 output_file = species+"_id_mapping_"+ time.strftime("%d-%m-%Y") + ".tsv"
 path = os.path.join(target_directory,output_file)
 with open(path,"w") as out :

Mercurial > repos > proteore > proteore_data_manager

comparison data_manager/resource_building.py @ 30:a6cabd3ab71f draft