proteore / proteore_data_manager: changeset 31:faeeabb11a4d (draft)
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
| | |
|---|---|
| author | proteore |
| date | Wed, 08 Jan 2020 09:53:34 +0000 |
| parents | a6cabd3ab71f |
| children | ec1febc6672e |
| files | data_manager/resource_building.py, data_manager/resource_building.xml |
| diffstat | 2 files changed, 27 insertions(+), 13 deletions(-) |
```diff
--- a/data_manager/resource_building.py	Thu Dec 12 09:26:42 2019 +0000
+++ b/data_manager/resource_building.py	Wed Jan 08 09:53:34 2020 +0000
@@ -3,7 +3,7 @@
 The purpose of this script is to create source files from
 different databases to be used in other proteore tools
 """
-import os, sys, argparse, requests, time, csv, re, json, shutil, zipfile
+import os, sys, argparse, requests, time, csv, re, json, shutil, zipfile, subprocess
 from io import BytesIO
 from zipfile import ZipFile
 from galaxy.util.json import from_json_string, to_json_string
@@ -131,11 +131,13 @@
 import ftplib, gzip
 csv.field_size_limit(sys.maxsize) # to handle big files
 
-def id_mapping_sources (data_manager_dict, species, target_directory) :
+def id_mapping_sources (data_manager_dict, species, target_directory, tool_data_path) :
     human = species == "Human"
     species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" }
     files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
+    archive = os.path.join(tool_data_path, "ID_mapping_archive_"+species+"_"+str(time.strftime("%Y%m%d")))
+    os.mkdir(archive)
 
     #header
     if human :
         tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG",'Gene_Name']]
@@ -148,8 +150,7 @@
             tab_reader = csv.reader(select,delimiter="\t")
             for line in tab_reader :
                 tab.append([line[0]]+[line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
-    os.remove(tab_path)
-
+    shutil.move(tab_path, archive)
     #print("selected_tab ok")
 
     #get uniprot-AC reviewed
@@ -161,11 +162,18 @@
     decoded_content = download.content.decode('utf-8')
     uniprot_reviewed_list = decoded_content.splitlines()
 
+    #save reviewed list
+    reviewed_list_path = os.path.join(target_directory,'uniprot_reviewed_list.txt')
+    with open(reviewed_list_path,'w') as reviewed_list_file:
+        for id in uniprot_reviewed_list:
+            reviewed_list_file.write(id+"\n")
+    shutil.move(reviewed_list_path, archive)
+
+    #remove unreviewed uniprot-AC
     for line in tab[1:]:
         UniProtAC = line[1]
         if UniProtAC not in uniprot_reviewed_list :
             line[1]=""
-            line[2]=""
 
     """
     Supplementary ID to get from HUMAN_9606_idmapping.dat :
@@ -194,7 +202,7 @@
                     unidict[uniprotID].update({ id_type : cor_id })
                 elif id_type in ids :
                     unidict[uniprotID]={id_type : cor_id}
-    os.remove(dat_path)
+    shutil.move(dat_path, archive)
 
     #print("dat_file ok")
 
@@ -221,7 +229,10 @@
     #add missing nextprot ID for human or replace old ones
     if human :
         #build next_dict
-        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
+        nextprot_path = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
+        with open(nextprot_path,'r') as nextprot_ids :
+            nextprot_ids = nextprot_ids.read().splitlines()
+        shutil.move(nextprot_path,archive)
         next_dict = {}
         for nextid in nextprot_ids :
             next_dict[nextid.replace("NX_","")] = nextid
@@ -240,6 +251,8 @@
     with open(path,"w") as out :
         w = csv.writer(out,delimiter='\t')
         w.writerows(tab)
+
+    subprocess.call(['tar', '-zcf', archive+".tar.gz", archive])
 
     name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"}
     name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")"
@@ -267,9 +280,8 @@
     ftp.cwd(ftp_dir)
     ftp.retrbinary("RETR " + file, open(path, 'wb').write)
     ftp.quit()
-    with open(path,'r') as nextprot_ids :
-        nextprot_ids = nextprot_ids.read().splitlines()
-    return (nextprot_ids)
+
+    return (path)
 
 #return '' if there's no value in a dictionary, avoid error
 def access_dictionary (dico,key1,key2) :
@@ -597,6 +609,7 @@
     parser.add_argument("--date")
     parser.add_argument("-o", "--output")
     parser.add_argument("--database")
+    parser.add_argument("--tool_data_path")
     args = parser.parse_args()
 
     data_manager_dict = {}
@@ -627,7 +640,7 @@
         #target_directory = "/projet/galaxydev/galaxy/tools/proteore/ProteoRE/tools/resources_building/test-data/"
         peptide_atlas = peptide_atlas.split(",")
         for pa_tissue in peptide_atlas:
-            peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory)
+            peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory, args.tool_data_path)
 
     ## Download ID_mapping source file from Uniprot
     try:
```
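The net effect of the Python-side change: instead of deleting the downloaded UniProt and neXtProt intermediates with `os.remove`, the script moves them into a dated `ID_mapping_archive_<species>_<date>` directory under Galaxy's tool-data path and then compresses that directory. A minimal sketch of the pattern follows; the paths and the stand-in file are hypothetical, and the stdlib `tarfile` module is used here in place of the commit's `tar` subprocess call:

```python
import os, shutil, tarfile, time

tool_data_path = "/tmp/tool-data"  # assumption: stands in for Galaxy's $__tool_data_path__
archive = os.path.join(tool_data_path,
                       "ID_mapping_archive_Human_" + time.strftime("%Y%m%d"))
os.makedirs(archive)  # the script itself calls os.mkdir on the real path

# Stand-in for a downloaded intermediate that used to be os.remove()'d:
intermediate = os.path.join(tool_data_path, "idmapping_selected.tab")
open(intermediate, "w").close()
shutil.move(intermediate, archive)  # archived instead of deleted

# Equivalent of subprocess.call(['tar', '-zcf', archive + ".tar.gz", archive]):
with tarfile.open(archive + ".tar.gz", "w:gz") as tar:
    tar.add(archive, arcname=os.path.basename(archive))
```

Using `tarfile` avoids depending on a `tar` binary being on the job's PATH; shelling out, as the commit does, is equivalent wherever GNU tar is available.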
```diff
--- a/data_manager/resource_building.xml	Thu Dec 12 09:26:42 2019 +0000
+++ b/data_manager/resource_building.xml	Wed Jan 08 09:53:34 2020 +0000
@@ -1,4 +1,4 @@
-<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2019.12.12" tool_type="manage_data">
+<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2020.01.08" tool_type="manage_data">
     <description>
         to create or update reference files for proteore tools
     </description>
@@ -27,6 +27,7 @@
             --database=$database.database
         #end if
         --output "$output"
+        --tool_data_path=$__tool_data_path__
 
     ]]></command>
 
@@ -43,7 +44,7 @@
             <param name="tissues" type="select" multiple="false" label="Please select tissue">
                 <option value="HPA_normal_tissue">Normal tissue</option>
                 <option value="HPA_pathology">Pathology</option>
-                <!--option value="HPA_full_atlas">Full Atlas</option-->
+                <option value="HPA_full_atlas">Full Atlas</option>
             </param>
         </when>
         <when value="peptide_atlas">
```
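On the wrapper side, `$__tool_data_path__` is Galaxy's built-in template variable for the server's tool-data directory; the new `--tool_data_path` argument simply forwards it to the script, where argparse picks it up. A rough, self-contained illustration of that plumbing (the path below is made up for the example):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--tool_data_path")  # receives $__tool_data_path__ from the wrapper

# Galaxy renders the <command> template into something like:
#   resource_building.py ... --tool_data_path=/galaxy/tool-data
args = parser.parse_args(["--tool_data_path=/galaxy/tool-data"])
print(args.tool_data_path)  # -> /galaxy/tool-data
```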
