comparison data_manager/resource_building.py @ 31:faeeabb11a4d draft

"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
author proteore
date Wed, 08 Jan 2020 09:53:34 +0000
parents a6cabd3ab71f
children ec1febc6672e
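
Summary of the change: id_mapping_sources() gains a tool_data_path argument, creates a dated ID_mapping_archive_<species>_<YYYYMMDD> directory there, moves the downloaded UniProt and neXtProt source files into it instead of deleting them, saves the reviewed-accession list alongside, and tars the directory up at the end; --tool_data_path is added to the command line and also passed through to peptide_atlas_sources().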
--- data_manager/resource_building.py  30:a6cabd3ab71f
+++ data_manager/resource_building.py  31:faeeabb11a4d
@@ -1,11 +1,11 @@
 # -*- coding: utf-8 -*-
 """
 The purpose of this script is to create source files from different databases to be used in other proteore tools
 """
 
-import os, sys, argparse, requests, time, csv, re, json, shutil, zipfile
+import os, sys, argparse, requests, subprocess, time, csv, re, json, shutil, zipfile
 from io import BytesIO
 from zipfile import ZipFile
 from galaxy.util.json import from_json_string, to_json_string
 
 #######################################################################################################
@@ -129,15 +129,17 @@
 # 3. ID mapping file
 #######################################################################################################
 import ftplib, gzip
 csv.field_size_limit(sys.maxsize) # to handle big files
 
-def id_mapping_sources (data_manager_dict, species, target_directory) :
+def id_mapping_sources (data_manager_dict, species, target_directory, tool_data_path) :
 
     human = species == "Human"
     species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" }
     files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
+    archive = os.path.join(tool_data_path, "ID_mapping_archive_"+species+"_"+str(time.strftime("%Y%m%d")))
+    os.mkdir(archive)
 
     #header
     if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG",'Gene_Name']]
     else : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","BioGrid","STRING","KEGG",'Gene_Name']]
 
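
Note on the new os.mkdir call: it raises FileExistsError if the data manager is run twice on the same day for the same species. A more forgiving variant would be (a sketch, not what the patch does):

    import os, time

    archive = os.path.join(tool_data_path,
                           "ID_mapping_archive_%s_%s" % (species, time.strftime("%Y%m%d")))
    os.makedirs(archive, exist_ok=True)  # tolerate a same-day re-run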
@@ -146,12 +148,11 @@
     tab_path = download_from_uniprot_ftp(selected_tab_file,target_directory)
     with gzip.open(tab_path,"rt") as select :
         tab_reader = csv.reader(select,delimiter="\t")
         for line in tab_reader :
             tab.append([line[0]]+[line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
-    os.remove(tab_path)
-
+    shutil.move(tab_path, archive)
     #print("selected_tab ok")
 
     #get uniprot-AC reviewed
     organism = species_dict[species].split("_")[1]
     query = "https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:"+organism+"&format=list"
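
For readers decoding the index list in the tab.append line: the 0-based columns of UniProt's idmapping_selected.tab are documented in the FTP README, and the selection lines up with the header row built above. A reference table (worth re-checking against the current README):

    # 0-based columns of idmapping_selected.tab (per the UniProt FTP README):
    # 0 UniProtKB-AC   1 UniProtKB-ID   2 GeneID (EntrezGene)   3 RefSeq
    # 4 GI             5 PDB            6 GO                    11 PIR
    # 13 MIM           14 UniGene       18 Ensembl              19 Ensembl_TRS
    # 20 Ensembl_PRO
    # line[0] is emitted twice: once as "UniProt-AC" and once as
    # "UniProt-AC_reviewed", which is blanked later for unreviewed entries.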
@@ -159,15 +160,22 @@
     with requests.Session() as s:
         download = s.get(query)
         decoded_content = download.content.decode('utf-8')
         uniprot_reviewed_list = decoded_content.splitlines()
 
+    #save reviewed list
+    reviewed_list_path = os.path.join(target_directory,'uniprot_reviewed_list.txt')
+    with open(reviewed_list_path,'w') as reviewed_list_file:
+        for id in uniprot_reviewed_list:
+            reviewed_list_file.write(id+"\n")
+    shutil.move(reviewed_list_path, archive)
+
+    #remove unreviewed uniprot-AC
     for line in tab[1:]:
         UniProtAC = line[1]
         if UniProtAC not in uniprot_reviewed_list :
             line[1]=""
-            line[2]=""
 
     """
     Supplementary ID to get from HUMAN_9606_idmapping.dat :
     -NextProt,BioGrid,STRING,KEGG
     """
@@ -192,11 +200,11 @@
                     unidict[uniprotID][id_type]= ";".join([unidict[uniprotID][id_type],cor_id]) #if there is already a value in the dictionary
                 else :
                     unidict[uniprotID].update({ id_type : cor_id })
             elif id_type in ids :
                 unidict[uniprotID]={id_type : cor_id}
-    os.remove(dat_path)
+    shutil.move(dat_path, archive)
 
     #print("dat_file ok")
 
     #add ids from idmapping.dat to the final tab
     for line in tab[1:] :
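
unidict is a dict of dicts mapping a UniProt accession to {id_type: "val1;val2;..."}. The three-branch insert above can be written more compactly with collections.defaultdict; a simplified sketch that applies the ids filter uniformly (the original only filters when creating a new entry):

    from collections import defaultdict

    unidict = defaultdict(dict)  # UniProt-AC -> {id_type: "val1;val2;..."}

    def add_id(uniprot_id, id_type, cor_id, ids):
        if id_type not in ids:
            return
        entry = unidict[uniprot_id]
        entry[id_type] = entry[id_type] + ";" + cor_id if id_type in entry else cor_id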
@@ -219,11 +227,13 @@
     #print ("tab ok")
 
     #add missing nextprot ID for human or replace old ones
     if human :
         #build next_dict
-        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
+        nextprot_path = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
+        with open(nextprot_path,'r') as nextprot_ids :
+            nextprot_ids = nextprot_ids.read().splitlines()
+        shutil.move(nextprot_path,archive)
         next_dict = {}
         for nextid in nextprot_ids :
             next_dict[nextid.replace("NX_","")] = nextid
-        os.remove(os.path.join(target_directory,"nextprot_ac_list_all.txt"))
 
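
Stripping the NX_ prefix gives a lookup from bare UniProt accession to neXtProt ID, e.g. next_dict["P01308"] == "NX_P01308"; the tab rows carry plain accessions, which is how the neXtProt column gets refreshed.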
@@ -238,10 +249,12 @@
     path = os.path.join(target_directory,output_file)
 
     with open(path,"w") as out :
         w = csv.writer(out,delimiter='\t')
         w.writerows(tab)
+
+    subprocess.call(['tar', '-zcf', archive+".tar.gz", archive])
 
     name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"}
     name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")"
     release = species+"_id_mapping_"+ time.strftime("%d-%m-%Y")
     id = str(10000000000 - int(time.strftime("%Y%m%d"))) #new ids must be smaller than previous ones -> the <filter> sort in the xml is descending only
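
The tar step shells out to the external tar binary. A pure-Python equivalent with the tarfile module (a sketch; arcname keeps the absolute tool_data_path prefix out of the tarball):

    import os, tarfile

    def tar_archive(archive_dir):
        with tarfile.open(archive_dir + ".tar.gz", "w:gz") as tar:
            tar.add(archive_dir, arcname=os.path.basename(archive_dir))

On the id line: for this commit's date, 10000000000 - 20200108 = 9979799892, and any later run yields a strictly smaller id, so the descending-only <filter> sort in the data table XML lists the newest release first.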
@@ -265,13 +278,12 @@
     ftp = ftplib.FTP("ftp.nextprot.org")
     ftp.login("anonymous", "anonymous")
     ftp.cwd(ftp_dir)
     ftp.retrbinary("RETR " + file, open(path, 'wb').write)
     ftp.quit()
-    with open(path,'r') as nextprot_ids :
-        nextprot_ids = nextprot_ids.read().splitlines()
-    return (nextprot_ids)
+
+    return (path)
 
 #return '' if there's no value in a dictionary, avoid error
 def access_dictionary (dico,key1,key2) :
     if key1 in dico :
         if key2 in dico[key1] :
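
retrbinary writes through an anonymous open(path, 'wb').write, leaving the file handle for the garbage collector to close. A sketch of the same download with explicit cleanup on Python 3 (fetch_from_nextprot_ftp is a hypothetical name; ftp_dir is whatever the enclosing function defines, which this hunk does not show):

    import ftplib, os

    def fetch_from_nextprot_ftp(file, ftp_dir, target_directory):
        path = os.path.join(target_directory, file)
        with ftplib.FTP("ftp.nextprot.org") as ftp:  # quits the session on exit
            ftp.login("anonymous", "anonymous")
            ftp.cwd(ftp_dir)
            with open(path, "wb") as fh:
                ftp.retrbinary("RETR " + file, fh.write)
        return path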
@@ -595,10 +607,11 @@
     parser.add_argument("--interactome", metavar = ("PPI"))
     parser.add_argument("--species")
     parser.add_argument("--date")
     parser.add_argument("-o", "--output")
     parser.add_argument("--database")
+    parser.add_argument("--tool_data_path")
     args = parser.parse_args()
 
     data_manager_dict = {}
     # Extract json file params
     filename = args.output
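
With the new argument, a data-manager run passes the Galaxy tool-data directory through along these lines (flag values are illustrative, not taken from the patch):

    python resource_building.py --id_mapping Human -o output.json --tool_data_path /galaxy/tool-data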
@@ -625,11 +638,11 @@
         peptide_atlas = None
     if peptide_atlas is not None:
         #target_directory = "/projet/galaxydev/galaxy/tools/proteore/ProteoRE/tools/resources_building/test-data/"
         peptide_atlas = peptide_atlas.split(",")
         for pa_tissue in peptide_atlas:
-            peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory)
+            peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory, args.tool_data_path)
 
     ## Download ID_mapping source file from Uniprot
     try:
         id_mapping=args.id_mapping
     except NameError:
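
One pre-existing quirk this hunk leaves in place: argparse sets unsupplied optional arguments to None rather than leaving the name undefined, so the except NameError around args.id_mapping can never fire. A plain truthiness test would express the intent:

    id_mapping = args.id_mapping  # None when --id_mapping was not supplied
    if id_mapping:
        ...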