# HG changeset patch # User proteore # Date 1578477214 0 # Node ID faeeabb11a4dc396c09a6b32e425188fa059bc36 # Parent a6cabd3ab71f3b4863af63da1ff49f40b64a8f46 "planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty" diff -r a6cabd3ab71f -r faeeabb11a4d data_manager/resource_building.py --- a/data_manager/resource_building.py Thu Dec 12 09:26:42 2019 +0000 +++ b/data_manager/resource_building.py Wed Jan 08 09:53:34 2020 +0000 @@ -3,7 +3,7 @@ The purpose of this script is to create source files from different databases to be used in other proteore tools """ -import os, sys, argparse, requests, time, csv, re, json, shutil, zipfile +import os, shutil, sys, argparse, requests, time, csv, re, json, shutil, zipfile from io import BytesIO from zipfile import ZipFile from galaxy.util.json import from_json_string, to_json_string @@ -131,11 +131,13 @@ import ftplib, gzip csv.field_size_limit(sys.maxsize) # to handle big files -def id_mapping_sources (data_manager_dict, species, target_directory) : +def id_mapping_sources (data_manager_dict, species, target_directory, tool_data_path) : human = species == "Human" species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" } files=["idmapping_selected.tab.gz","idmapping.dat.gz"] + archive = os.path.join(tool_data_path, "ID_mapping_archive_"+species+"_"+str(time.strftime("%Y%m%d"))) + os.mkdir(archive) #header if human : tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG",'Gene_Name']] @@ -148,8 +150,7 @@ tab_reader = csv.reader(select,delimiter="\t") for line in tab_reader : tab.append([line[0]]+[line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]]) - os.remove(tab_path) - + shutil.move(tab_path, archive) #print("selected_tab ok") #get uniprot-AC reviewed @@ -161,11 +162,18 @@ decoded_content = download.content.decode('utf-8') uniprot_reviewed_list = decoded_content.splitlines() + #save reviewed list + reviewed_list_path = os.path.join(archive,'uniprot_reviewed_list.txt') + with open(reviewed_list_path,w) as reviewed_list_file: + for id in uniprot_reviewed_list: + reviewed_list_file.write(id+"\n") + shutil.move(reviewed_list_path, archive) + + #remove unreviewed uniprot-AC for line in tab[1:]: UniProtAC = line[1] if UniProtAC not in uniprot_reviewed_list : line[1]="" - line[2]="" """ Supplementary ID to get from HUMAN_9606_idmapping.dat : @@ -194,7 +202,7 @@ unidict[uniprotID].update({ id_type : cor_id }) elif id_type in ids : unidict[uniprotID]={id_type : cor_id} - os.remove(dat_path) + shutil.move(dat_path, archive) #print("dat_file ok") @@ -221,7 +229,10 @@ #add missing nextprot ID for human or replace old ones if human : #build next_dict - nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory) + nextprot_path = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory) + with open(nextprot_path,'r') as nextprot_ids : + nextprot_ids = nextprot_ids.read().splitlines() + shutil.move(nextprot_path,archive) next_dict = {} for nextid in nextprot_ids : next_dict[nextid.replace("NX_","")] = nextid @@ -240,6 +251,8 @@ with open(path,"w") as out : w = csv.writer(out,delimiter='\t') w.writerows(tab) + + subprocess.call(['tar', '-zczf', archive+".tar.gz", archive]) name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"} name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")" @@ -267,9 +280,8 @@ ftp.cwd(ftp_dir) ftp.retrbinary("RETR " + file, open(path, 'wb').write) ftp.quit() - with open(path,'r') as nextprot_ids : - nextprot_ids = nextprot_ids.read().splitlines() - return (nextprot_ids) + + return (path) #return '' if there's no value in a dictionary, avoid error def access_dictionary (dico,key1,key2) : @@ -597,6 +609,7 @@ parser.add_argument("--date") parser.add_argument("-o", "--output") parser.add_argument("--database") + parser.add_argument("--tool_data_path") args = parser.parse_args() data_manager_dict = {} @@ -627,7 +640,7 @@ #target_directory = "/projet/galaxydev/galaxy/tools/proteore/ProteoRE/tools/resources_building/test-data/" peptide_atlas = peptide_atlas.split(",") for pa_tissue in peptide_atlas: - peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory) + peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory, args.tool_data_path) ## Download ID_mapping source file from Uniprot try: diff -r a6cabd3ab71f -r faeeabb11a4d data_manager/resource_building.xml --- a/data_manager/resource_building.xml Thu Dec 12 09:26:42 2019 +0000 +++ b/data_manager/resource_building.xml Wed Jan 08 09:53:34 2020 +0000 @@ -1,4 +1,4 @@ - + to create or update reference files for proteore tools @@ -27,6 +27,7 @@ --database=$database.database #end if --output "$output" + --tool_data_path=$__tool_data_path__ ]]> @@ -43,7 +44,7 @@ - +