proteore / proteore_data_manager: changeset 31:faeeabb11a4d (draft)
"planemo upload commit ba867b8fa3352695fbda1ae764407f363ee79a50-dirty"
| | |
|---|---|
| author | proteore |
| date | Wed, 08 Jan 2020 09:53:34 +0000 |
| parents | a6cabd3ab71f |
| children | ec1febc6672e |
| files | data_manager/resource_building.py, data_manager/resource_building.xml |
| diffstat | 2 files changed, 27 insertions(+), 13 deletions(-) |
```diff
--- a/data_manager/resource_building.py	Thu Dec 12 09:26:42 2019 +0000
+++ b/data_manager/resource_building.py	Wed Jan 08 09:53:34 2020 +0000
@@ -3,7 +3,7 @@
 The purpose of this script is to create source files from
 different databases to be used in other proteore tools
 """
-import os, sys, argparse, requests, time, csv, re, json, shutil, zipfile
+import os, sys, argparse, requests, time, csv, re, json, shutil, zipfile, subprocess
 from io import BytesIO
 from zipfile import ZipFile
 from galaxy.util.json import from_json_string, to_json_string
@@ -131,11 +131,13 @@
 import ftplib, gzip
 csv.field_size_limit(sys.maxsize) # to handle big files
 
-def id_mapping_sources (data_manager_dict, species, target_directory) :
+def id_mapping_sources (data_manager_dict, species, target_directory, tool_data_path) :
     human = species == "Human"
     species_dict = { "Human" : "HUMAN_9606", "Mouse" : "MOUSE_10090", "Rat" : "RAT_10116" }
     files=["idmapping_selected.tab.gz","idmapping.dat.gz"]
+    archive = os.path.join(tool_data_path, "ID_mapping_archive_"+species+"_"+str(time.strftime("%Y%m%d")))
+    os.mkdir(archive)
 
     #header
     if human :
         tab = [["UniProt-AC","UniProt-AC_reviewed","UniProt-ID","GeneID","RefSeq","GI","PDB","GO","PIR","MIM","UniGene","Ensembl_Gene","Ensembl_Transcript","Ensembl_Protein","neXtProt","BioGrid","STRING","KEGG",'Gene_Name']]
@@ -148,8 +150,7 @@
             tab_reader = csv.reader(select,delimiter="\t")
             for line in tab_reader :
                 tab.append([line[0]]+[line[i] for i in [0,1,2,3,4,5,6,11,13,14,18,19,20]])
-    os.remove(tab_path)
-
+    shutil.move(tab_path, archive)
     #print("selected_tab ok")
 
     #get uniprot-AC reviewed
@@ -161,11 +162,18 @@
     decoded_content = download.content.decode('utf-8')
     uniprot_reviewed_list = decoded_content.splitlines()
 
+    #save reviewed list
+    reviewed_list_path = os.path.join(target_directory,'uniprot_reviewed_list.txt')
+    with open(reviewed_list_path,'w') as reviewed_list_file:
+        for id in uniprot_reviewed_list:
+            reviewed_list_file.write(id+"\n")
+    shutil.move(reviewed_list_path, archive)
+
+    #remove unreviewed uniprot-AC
     for line in tab[1:]:
         UniProtAC = line[1]
         if UniProtAC not in uniprot_reviewed_list :
             line[1]=""
-            line[2]=""
 
     """
     Supplementary ID to get from HUMAN_9606_idmapping.dat :
@@ -194,7 +202,7 @@
                     unidict[uniprotID].update({ id_type : cor_id })
                 elif id_type in ids :
                     unidict[uniprotID]={id_type : cor_id}
-    os.remove(dat_path)
+    shutil.move(dat_path, archive)
 
     #print("dat_file ok")
 
@@ -221,7 +229,10 @@
     #add missing nextprot ID for human or replace old ones
     if human :
         #build next_dict
-        nextprot_ids = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
+        nextprot_path = id_list_from_nextprot_ftp("nextprot_ac_list_all.txt",target_directory)
+        with open(nextprot_path,'r') as nextprot_ids :
+            nextprot_ids = nextprot_ids.read().splitlines()
+        shutil.move(nextprot_path,archive)
         next_dict = {}
         for nextid in nextprot_ids :
             next_dict[nextid.replace("NX_","")] = nextid
@@ -240,6 +251,8 @@
     with open(path,"w") as out :
         w = csv.writer(out,delimiter='\t')
         w.writerows(tab)
+
+    subprocess.call(['tar', '-zcf', archive+".tar.gz", archive])
 
     name_dict={"Human" : "Homo sapiens", "Mouse" : "Mus musculus", "Rat" : "Rattus norvegicus"}
     name = species +" (" + name_dict[species]+" "+time.strftime("%d/%m/%Y")+")"
@@ -267,9 +280,8 @@
     ftp.cwd(ftp_dir)
     ftp.retrbinary("RETR " + file, open(path, 'wb').write)
     ftp.quit()
-    with open(path,'r') as nextprot_ids :
-        nextprot_ids = nextprot_ids.read().splitlines()
-    return (nextprot_ids)
+
+    return (path)
 
 #return '' if there's no value in a dictionary, avoid error
 def access_dictionary (dico,key1,key2) :
@@ -597,6 +609,7 @@
     parser.add_argument("--date")
     parser.add_argument("-o", "--output")
     parser.add_argument("--database")
+    parser.add_argument("--tool_data_path")
     args = parser.parse_args()
 
     data_manager_dict = {}
@@ -627,7 +640,7 @@
         #target_directory = "/projet/galaxydev/galaxy/tools/proteore/ProteoRE/tools/resources_building/test-data/"
         peptide_atlas = peptide_atlas.split(",")
         for pa_tissue in peptide_atlas:
-            peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory)
+            peptide_atlas_sources(data_manager_dict, pa_tissue, date, target_directory, args.tool_data_path)
 
     ## Download ID_mapping source file from Uniprot
     try:
```
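The net effect of the Python-side change: instead of deleting the downloaded UniProt and neXtProt intermediates with `os.remove`, the script moves them into a dated `ID_mapping_archive_<species>_<date>` directory under Galaxy's tool-data path and then compresses that directory. A minimal sketch of the pattern follows; the paths and the stand-in file are hypothetical, and the stdlib `tarfile` module is used here in place of the commit's `tar` subprocess call:

```python
import os, shutil, tarfile, time

tool_data_path = "/tmp/tool-data"  # assumption: stands in for Galaxy's $__tool_data_path__
archive = os.path.join(tool_data_path,
                       "ID_mapping_archive_Human_" + time.strftime("%Y%m%d"))
os.makedirs(archive)  # the script itself calls os.mkdir on the real path

# Stand-in for a downloaded intermediate that used to be os.remove()'d:
intermediate = os.path.join(tool_data_path, "idmapping_selected.tab")
open(intermediate, "w").close()
shutil.move(intermediate, archive)  # archived instead of deleted

# Equivalent of subprocess.call(['tar', '-zcf', archive + ".tar.gz", archive]):
with tarfile.open(archive + ".tar.gz", "w:gz") as tar:
    tar.add(archive, arcname=os.path.basename(archive))
```

Using `tarfile` avoids depending on a `tar` binary being on the job's PATH; shelling out, as the commit does, is equivalent wherever GNU tar is available.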
```diff
--- a/data_manager/resource_building.xml	Thu Dec 12 09:26:42 2019 +0000
+++ b/data_manager/resource_building.xml	Wed Jan 08 09:53:34 2020 +0000
@@ -1,4 +1,4 @@
-<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2019.12.12" tool_type="manage_data">
+<tool id="data_manager_proteore" name="Get source files for proteore tools" version="2020.01.08" tool_type="manage_data">
     <description>
         to create or update reference files for proteore tools
     </description>
@@ -27,6 +27,7 @@
             --database=$database.database
         #end if
         --output "$output"
+        --tool_data_path=$__tool_data_path__
 
     ]]></command>
 
@@ -43,7 +44,7 @@
             <param name="tissues" type="select" multiple="false" label="Please select tissue">
                 <option value="HPA_normal_tissue">Normal tissue</option>
                 <option value="HPA_pathology">Pathology</option>
-                <!--option value="HPA_full_atlas">Full Atlas</option-->
+                <option value="HPA_full_atlas">Full Atlas</option>
             </param>
         </when>
         <when value="peptide_atlas">
```
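On the wrapper side, `$__tool_data_path__` is Galaxy's built-in template variable for the server's tool-data directory; the new `--tool_data_path` argument simply forwards it to the script, where argparse picks it up. A rough, self-contained illustration of that plumbing (the path below is made up for the example):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--tool_data_path")  # receives $__tool_data_path__ from the wrapper

# Galaxy renders the <command> template into something like:
#   resource_building.py ... --tool_data_path=/galaxy/tool-data
args = parser.parse_args(["--tool_data_path=/galaxy/tool-data"])
print(args.tool_data_path)  # -> /galaxy/tool-data
```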
