changeset 0:fcf82328f066 draft default tip

new version. Not tested yet.
author trinity_ctat
date Mon, 22 Jan 2018 16:06:42 -0500
parents
children
files data_manager/add_ctat_ref_lib.py data_manager/add_ctat_ref_lib.xml data_manager/get_ctat_genome_filenames.py tool-data/ctat_genome_ref_libs.loc.sample tool_data_table_conf.xml.sample
diffstat 5 files changed, 334 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/add_ctat_ref_lib.py	Mon Jan 22 16:06:42 2018 -0500
@@ -0,0 +1,248 @@
+#!/usr/bin/env python
+# ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/
+
+# Rewritten by H.E. Cicada Brokaw Dennis from a source downloaded from the toolshed and
+# other example code on the web.
+# This now allows downloading of a user selected library
+# but only from the CTAT Genome Reference Library website.
+# Ultimately we might want to allow the user to specify any location 
+# from which to download.
+# Users can create or download other libraries and use this tool to add them if they don't want
+# or don't know how to add them by hand.
+
+import argparse
+import os
+#import tarfile
+#import urllib
+import subprocess
+
+from galaxy.util.json import from_json_string, to_json_string
+
+# The following function is used by the Data Manager interface (.xml file) to get the
+# filenames that are available online at broadinstitute.org
+# Not sure best way to do it. This method parses the html looking for the filenames.
+import urllib2
+from HTMLParser import HTMLParser
+
+class FileListParser(HTMLParser):
+    def __init__(self):
+        # Have to use direct call to super class rather than using super():
+        # super(FileListParser, self).__init__()
+        # because HTMLParser is an "old style" class and its inheritance chain does not include object.
+        HTMLParser.__init__(self)
+        self.filenames = set()
+    def handle_starttag(self, tag, attrs):
+        # Look for filename references in anchor tags and add them to filenames.
+        if tag == "a":
+            # The tag is an anchor tag.
+            for attribute in attrs:
+                # print "Checking: {:s}".format(str(attribute))
+                if attribute[0] == "href":
+                    # Does the href have a tar.gz in it?
+                    if ("tar.gz" in attribute[1]) and ("md5" not in attribute[1]):
+                        # Add the value to filenames.
+                        self.filenames.add(attribute[1])            
+# End of class FileListParser
+
+def get_ctat_genome_filenames():
+    # open the url and retrieve the filenames of the files in the directory.
+    resource = urllib2.urlopen('https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/')
+    theHTML = resource.read()
+    filelist_parser = FileListParser()
+    filelist_parser.feed(theHTML)
+    # return a tuple of the filenames
+    return tuple(filelist_parser.filenames)
+
+# The following was used by the example program to get input parameters through the json.
+# Just leaving here for reference.
+#def get_reference_id_name(params):
+#    genome_id = params['param_dict']['genome_id']
+#    genome_name = params['param_dict']['genome_name']
+#    return genome_id, genome_name
+#
+#def get_url(params):
+#    trained_url = params['param_dict']['trained_url']
+#    return trained_url
+
+def download_from_BroadInst(src_filename, destination):
+    # FIX - The name of this function is too narrow now. It does more than download.
+    # Perhaps split function into its pieces and rename.
+    # FIX - need to consider if this is a rerun of a failed processing or download
+    # If the files that would be downloaded exist and are the correct size, we should
+    # skip the download, also in post-processing we should see if the data has been
+    # processed before, and whether the processed files are the correct size?
+    # or do the functions themselves already check if the files are there and skip steps?
+    # Maybe add a field for the user to indicate to ignore/delete previous data and
+    # redownload and reprocess. In Notes to Galaxy Admin recommend that certain memory 
+    # and computing resources are needed to generate the indexes.
+    ctat_resource_lib = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/' + src_filename
+    # FIX - Check that the download directory is empty if it exists. 
+    # why does it need to be empty? The downloaded file will be a single directory in that file.
+    # Also, can we check if there is enough space on the device as well?
+    # FIX - Also we want to make sure that destination is absolute fully specified path.
+    cannonical_destination = os.path.realpath(destination)
+    if os.path.exists(cannonical_destination):
+        if not os.path.isdir(cannonical_destination):
+            raise ValueError("The destination is not a directory: {:s}".format(cannonical_destination))
+        # else all is good. It is a directory.
+    else:
+        # We need to create it.
+        os.makedirs(cannonical_destination)
+    # Get the list of files in the directory, so after we extract the archive we can find the one
+    # that was extracted as the file that is not in this list.
+    orig_files_in_destdir = set(os.listdir(cannonical_destination))
+    #Download ref: https://dzone.com/articles/how-download-file-python
+    #f = urllib2.urlopen(ctat_resource_lib)
+    #data = f.read()
+    #with open(filepath, 'wb') as code:
+    #    code.write(data)
+    # another way
+    #full_filepath = os.path.join(destination, src_filename)
+    #urllib.urlretrieve(url=ctat_resource_lib, filename=full_filepath)
+    # Put the following into a try statement, so that if there is a failure 
+    # something can be printed about it before reraising exception.
+    #tarfile.open(full_filepath, mode='r:*').extractall()
+    # But we want to transfer and untar it without storing the tar file, because that
+    # adds all that much more space to the needed amount of free space.
+    # so use subprocess to pipe the output of curl into tar.
+    command = "curl {:s} | tar -xzvf - -C {:s}".format(ctat_resource_lib, cannonical_destination)
+    try: # to run the command that downloads and extracts the file.
+        command_output = subprocess.check_output(command, shell=True)
+    except subprocess.CalledProcessError as e:
+        print "ERROR: Trying to run the following command:\n\t{:s}".format(command)
+        print "================================================"
+        print "\tOutput while running the command was:\n\n{:s}".format(e.output)
+        print "================================================"
+        raise
+    # Get the root filename of the extracted file. It will be the file that was not in the directory
+    # before we did the download and extraction.
+    newfiles_in_destdir = set(os.listdir(cannonical_destination)) - orig_files_in_destdir
+    found_filename = None
+    for filename in newfiles_in_destdir:
+        # If things are right there should just be one new file, the directory that was extracted.
+        # But in case there was something that happened on the system that created other files,
+        # the correct file's name should be a substring of the tar file that was downloaded.
+        if filename in src_filename:
+            found_filename = filename
+    if found_filename is not None:
+        ctat_genome_directory = cannonical_destination + "/" + found_filename
+        if len(os.listdir(ctat_genome_directory)) == 1:
+            # Then that one file is a subdirectory that should be the ctat_genome_directory.
+            subdir_filename = os.listdir(ctat_genome_directory)[0]
+            ctat_genome_directory += "/" + subdir_filename
+    else:
+        raise ValueError("ERROR: Could not find the extracted file in the destination directory:" + \
+                         "\n\t{:s}".format(cannonical_destination))
+
+    # In all downloaded libraries there is additional processing 
+    # that needs to happen for gmap-fusion to work.
+    command = "gmap_build -D {:s}/ -d ref_genome.fa.gmap -k 13 {:s}/ref_genome.fa".format( \
+              ctat_genome_directory, ctat_genome_directory)
+    try: # to run the command.
+        command_output = subprocess.check_output(command, shell=True)
+    except subprocess.CalledProcessError as e:
+        print "ERROR: While trying to process the genome library library:\n\t{:s}".format(command)
+        print "================================================"
+        print "\n\tOutput while running the command was:\n\n{:s}".format(e.output)
+        print "================================================"
+        raise
+
+    # If the src_filename indicates it is a source file, as opposed to plug-n-play, 
+    # then we need to do additional post processing on it with FusionFilter commands.
+    if src_filename.split(".").contains("source_data"):
+        # The use of conda to install the FusionFilter should make the following commands
+        # available without the need to find out where FusionFilter resides. 
+        # ${FusionFilter_HOME}/prep_genome_lib.pl \
+        #    --genome_fa ref_genome.fa \
+        #    --gtf ref_annot.gtf \
+        #    --blast_pairs blast_pairs.gene_syms.outfmt6.gz \
+        #    --fusion_annot_lib fusion_lib.dat.gz
+        # ${FusionFilter_HOME}/util/index_pfam_domain_info.pl  \
+        #     --pfam_domains PFAM.domtblout.dat.gz \
+        #     --genome_lib_dir ctat_genome_lib_build_dir
+        #
+        # I don't know if we can run the programs without changing to the directory.
+        # The instructions in https://github.com/FusionFilter/FusionFilter/wiki
+        # say to change directory before running the commands.
+        os.chdir(ctat_genome_directory)
+        command = "prep_genome_lib.pl " + \
+                    "--genome_fa ref_genome.fa " + \
+                    "--gtf ref_annot.gtf " + \
+                    "--blast_pairs blast_pairs.gene_syms.outfmt6.gz " + \
+                    "--fusion_annot_lib fusion_lib.dat.gz"
+        try: # to run the command.
+            command_output = subprocess.check_output(command, shell=True)
+        except subprocess.CalledProcessError as e:
+            print "ERROR: While trying to process the genome library:\n\t{:s}".format(command)
+            print "================================================"
+            print "\n\tOutput while running the command was:\n\n{:s}".format(e.output)
+            print "================================================"
+            raise
+        command = "index_pfam_domain_info.pl " + \
+                    "--pfam_domains PFAM.domtblout.dat.gz " + \
+                    "--genome_lib_dir \"{:s}\"".format(ctat_genome_directory)
+        try: # to run the command.
+            command_output = subprocess.check_output(command, shell=True)
+        except subprocess.CalledProcessError as e:
+            print "ERROR: While trying to process the genome library:\n\t{:s}".format(command)
+            print "================================================"
+            print "\n\tOutput while running the command was:\n\n{:s}".format(e.output)
+            print "================================================"
+            raise
+    # end of post-processing for source_data files
+    
+    return ctat_genome_directory
+
+def main():
+    #Parse Command Line
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-d', '--download', default="", \
+        help='Do not use if you already have a CTAT Resource Library that this program downloads.')
+    parser.add_argument('-g', '--genome_name', default="UNSPECIFIED_GenomeName", \
+        help='Is used as the selector text of the entry in the data table.')
+    parser.add_argument('-p', '--destination_path', \
+        help='Full path of the CTAT Resource Library location or destination.')
+    parser.add_argument('-o', '--output_filename', \
+        help='Name of the output file, where the json dictionary will be written.')
+    args = parser.parse_args()
+
+    # All of the input parameters are written by default to the json output file prior to
+    # this program being called.
+    # But I do not get input values from the json file, but rather from command line.
+    # Just leaving the following code as a comment, in case it might be useful to someone later.
+    # The target_directory is the typical location where data managers put their data, but then
+    # the typical model is to then copy it to the final location. With our files taking up so many 
+    # GB of space, we don't want to move them around, but rather have the Galaxy Admin give us
+    # the final location (the destination_path) where the files will be placed (or currently reside).
+    #
+    # params = from_json_string(open(output_filename).read())
+    # target_directory = params['output_data'][0]['extra_files_path']
+    # os.mkdir(target_directory)
+
+    if args.download != "":
+        ctat_genome_resource_lib_path = \
+            download_from_BroadInst(src_filename=args.download, destination=args.destination_path)
+    else:
+        # FIX - probably should check if this is a valid path with an actual CTAT Genome Ref Lib there.
+        ctat_genome_resource_lib_path = args.destination_path
+
+    if (args.genome_name is None) or (args.genome_name == ""):
+        genome_name = "GRCh38_gencode_v26"
+    else:
+        genome_name = args.genome_name
+    # Set the table_entry_value to the basename of the directory path minus the extension. 
+    # FIX - Need to make sure is unique. This is not good way to do it. Just doing it this way now for testing.
+    table_entry_value = os.path.basename(ctat_genome_resource_lib_path).split(".")[0]
+    data_manager_dict = {}
+    data_manager_dict['data_tables'] = {}
+    data_manager_dict['data_tables']['ctat_genome_ref_libs'] = []
+    data_table_entry = dict(value=table_entry_value, name=genome_name, path=ctat_genome_resource_lib_path)
+    data_manager_dict['data_tables']['ctat_genome_ref_libs'].append(data_table_entry)
+
+    # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager,
+    # which then puts it into the correct .loc file (I think).
+    open(args.output_filename, 'wb').write(to_json_string(data_manager_dict))
+
+if __name__ == "__main__":
+    main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/add_ctat_ref_lib.xml	Mon Jan 22 16:06:42 2018 -0500
@@ -0,0 +1,61 @@
+<tool id="ctat_genome_ref_lib_data_manager" 
+    name="CTAT Genome Reference Library Data Manager" 
+    version="1.0.0" tool_type="manage_data">
+    <description>Retrieve, and/or specify the location of, a CTAT Genome Reference Library. 
+    </description>
+    <requirements>
+        <requirement type="package" version="2.7">python</requirement>
+        <requirement type="package" version="0.3.0">gmap-fusion</requirement>
+        <!-- gmap-fusion is required in order to process downloaded libraries 
+             to create all of the required files and indexes. It includes gmap
+             and FusionFilter, programs from both of which are needed.
+        -->
+    </requirements>
+    <command detect_errors="default">
+        <![CDATA[
+        python $__tool_directory__/add_ctat_ref_lib.py 
+            --genome_name "${genome_name}"
+            --destination_path "${destination}" 
+            --output_filename "${out_file}"
+            #if str(${download}) == "true":
+                --download "${filename}"
+            #end if
+        ]]>
+    </command>
+    <inputs>
+        <!--
+        <param name="download" type="boolean" checked="false"
+             truevalue="- -download" falsevalue="" label="Need to Download? (yes/no)" />
+            <param name="download" type="select" label="Need to Download?">
+                <option value="single" selected="true">Single Dataset</option>
+                <option value="paired_collection">Paired Collection</option>
+            <when value="paired_collection">
+                 <param name="fastq_input" format="fastqsanger" type="data_collection" collection_type="paired" label="Select dataset pair" help="Specify paired dataset collection containing paired reads"/>
+            </when>
+        -->
+        <conditional name="download_question">
+            <param name="download" type="boolean" checked="false" label="Need to Download? (yes/no)" />
+            </param>
+            <when value="true">
+                 <param name="filename" type="select" label="Select File" display="radio" 
+                     dynamic_options="get_ctat_genome_filenames()" help="Select a CTAT Genome Reference Library to Download." />
+            </when>
+        </conditional>
+
+        <param name="genome_name" type="text" label="Reference Genome name" />
+        <param name="destination" type="text" label="Local Destination (full path)" />
+    </inputs>
+    <outputs>
+        <data name="out_file" format="data_manager_json" />
+    </outputs>
+    <help>
+        Retrieve, and/or specify the location of, a CTAT Genome Reference Library.
+        When download is true, the file retrieved and processed is https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh38_gencode_v26_CTAT_lib_Nov012017.plug-n-play.tar.gz.
+        Specify the Full Path of the location where the CTAT Reference Library should be placed.
+        You will need approximately 30GB of space for this library.
+        If you already have the library, specify the full path of the location where it exists and leave the download box unchecked.
+        The Reference Genome name may be left empty if downloading. The name will be used as the selector text of the entry in the data table.
+        For more information on CTAT Genome Reference Libraries, see <a http="https://github.com/FusionFilter/FusionFilter/wiki">FusionFilter</a>
+    </help>
+    <code file="add_ctat_ref_lib.py"
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/get_ctat_genome_filenames.py	Mon Jan 22 16:06:42 2018 -0500
@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+
+# FIX - NEED TO FINISH WRITING THIS.
+get_ctat_genome_filenames()
+    open the url and retrieve the filenames of the files in the directory.
+    return a tuple of the filenames
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/ctat_genome_ref_libs.loc.sample	Mon Jan 22 16:06:42 2018 -0500
@@ -0,0 +1,12 @@
+# This file lists the locations of CTAT Genome Reference Libraries
+# Usually there will only be one library, but it is concievable 
+# that there could be multiple libraries.
+# This file format is as follows
+# (white space characters are TAB characters):
+#
+#<unique_id>    <display_name>  <file_path>
+#
+#ctat_genome_ref_libs.loc could look like:
+#
+#CTAT_RESOURCE_LIB  GRCh38_gencode_v26   /ctat/genome/resource/lib/path
+#
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Mon Jan 22 16:06:42 2018 -0500
@@ -0,0 +1,6 @@
+<tables>
+    <table name="ctat_genome_ref_libs" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, name, path</columns>
+        <file path="tool-data/ctat_genome_ref_libs.loc" />
+    </table>
+</tables>