Mercurial > repos > trinity_ctat > not_used
changeset 0:fcf82328f066 draft default tip
new version. Not tested yet.
author      trinity_ctat
date        Mon, 22 Jan 2018 16:06:42 -0500
parents     (none)
children    (none)
files       data_manager/add_ctat_ref_lib.py  data_manager/add_ctat_ref_lib.xml  data_manager/get_ctat_genome_filenames.py  tool-data/ctat_genome_ref_libs.loc.sample  tool_data_table_conf.xml.sample
diffstat    5 files changed, 334 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/add_ctat_ref_lib.py	Mon Jan 22 16:06:42 2018 -0500
@@ -0,0 +1,248 @@
+#!/usr/bin/env python
+# ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/
+
+# Rewritten by H.E. Cicada Brokaw Dennis from a source downloaded from the toolshed and
+# other example code on the web.
+# This now allows downloading of a user selected library
+# but only from the CTAT Genome Reference Library website.
+# Ultimately we might want to allow the user to specify any location
+# from which to download.
+# Users can create or download other libraries and use this tool to add them if they don't want
+# or don't know how to add them by hand.
+
+import argparse
+import os
+#import tarfile
+#import urllib
+import subprocess
+
+from galaxy.util.json import from_json_string, to_json_string
+
+# The following function is used by the Data Manager interface (.xml file) to get the
+# filenames that are available online at broadinstitute.org
+# Not sure best way to do it. This method parses the html looking for the filenames.
+import urllib2
+from HTMLParser import HTMLParser
+
+class FileListParser(HTMLParser):
+    def __init__(self):
+        # Have to use direct call to super class rather than using super():
+        #     super(FileListParser, self).__init__()
+        # because HTMLParser is an "old style" class and its inheritance chain does not include object.
+        HTMLParser.__init__(self)
+        self.filenames = set()
+    def handle_starttag(self, tag, attrs):
+        # Look for filename references in anchor tags and add them to filenames.
+        if tag == "a":
+            # The tag is an anchor tag.
+            for attribute in attrs:
+                # print "Checking: {:s}".format(str(attribute))
+                if attribute[0] == "href":
+                    # Does the href have a tar.gz in it?
+                    if ("tar.gz" in attribute[1]) and ("md5" not in attribute[1]):
+                        # Add the value to filenames.
+                        self.filenames.add(attribute[1])
+# End of class FileListParser
+
+def get_ctat_genome_filenames():
+    # open the url and retrieve the filenames of the files in the directory.
+    resource = urllib2.urlopen('https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/')
+    theHTML = resource.read()
+    filelist_parser = FileListParser()
+    filelist_parser.feed(theHTML)
+    # return a tuple of the filenames
+    return tuple(filelist_parser.filenames)
+
+# The following was used by the example program to get input parameters through the json.
+# Just leaving here for reference.
+#def get_reference_id_name(params):
+#    genome_id = params['param_dict']['genome_id']
+#    genome_name = params['param_dict']['genome_name']
+#    return genome_id, genome_name
+#
+#def get_url(params):
+#    trained_url = params['param_dict']['trained_url']
+#    return trained_url
+
+def download_from_BroadInst(src_filename, destination):
+    # FIX - The name of this function is too narrow now. It does more than download.
+    # Perhaps split function into its pieces and rename.
+    # FIX - need to consider if this is a rerun of a failed processing or download.
+    # If the files that would be downloaded exist and are the correct size, we should
+    # skip the download; also in post-processing we should see if the data has been
+    # processed before, and whether the processed files are the correct size,
+    # or do the functions themselves already check if the files are there and skip steps?
+    # Maybe add a field for the user to indicate to ignore/delete previous data and
+    # redownload and reprocess. In Notes to Galaxy Admin recommend that certain memory
+    # and computing resources are needed to generate the indexes.
+    ctat_resource_lib = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/' + src_filename
+    # FIX - Check that the download directory is empty if it exists.
+    # Why does it need to be empty? The downloaded file will be a single directory in that file.
+    # Also, can we check if there is enough space on the device as well?
+    # FIX - Also we want to make sure that destination is an absolute, fully specified path.
+    cannonical_destination = os.path.realpath(destination)
+    if os.path.exists(cannonical_destination):
+        if not os.path.isdir(cannonical_destination):
+            raise ValueError("The destination is not a directory: {:s}".format(cannonical_destination))
+        # else all is good. It is a directory.
+    else:
+        # We need to create it.
+        os.makedirs(cannonical_destination)
+    # Get the list of files in the directory, so after we extract the archive we can find the one
+    # that was extracted as the file that is not in this list.
+    orig_files_in_destdir = set(os.listdir(cannonical_destination))
+    # Download ref: https://dzone.com/articles/how-download-file-python
+    #f = urllib2.urlopen(ctat_resource_lib)
+    #data = f.read()
+    #with open(filepath, 'wb') as code:
+    #    code.write(data)
+    # another way:
+    #full_filepath = os.path.join(destination, src_filename)
+    #urllib.urlretrieve(url=ctat_resource_lib, filename=full_filepath)
+    # Put the following into a try statement, so that if there is a failure
+    # something can be printed about it before reraising the exception.
+    #tarfile.open(full_filepath, mode='r:*').extractall()
+    # But we want to transfer and untar it without storing the tar file, because that
+    # adds all that much more space to the needed amount of free space.
+    # So use subprocess to pipe the output of curl into tar.
+    command = "curl {:s} | tar -xzvf - -C {:s}".format(ctat_resource_lib, cannonical_destination)
+    try:  # to run the command that downloads and extracts the file.
+        command_output = subprocess.check_output(command, shell=True)
+    except subprocess.CalledProcessError as e:
+        print "ERROR: Trying to run the following command:\n\t{:s}".format(command)
+        print "================================================"
+        print "\tOutput while running the command was:\n\n{:s}".format(e.output)
+        print "================================================"
+        raise
+    # Get the root filename of the extracted file. It will be the file that was not in the directory
+    # before we did the download and extraction.
+    newfiles_in_destdir = set(os.listdir(cannonical_destination)) - orig_files_in_destdir
+    found_filename = None
+    for filename in newfiles_in_destdir:
+        # If things are right there should just be one new file, the directory that was extracted.
+        # But in case there was something that happened on the system that created other files,
+        # the correct file's name should be a substring of the tar file that was downloaded.
+        if filename in src_filename:
+            found_filename = filename
+    if found_filename is not None:
+        ctat_genome_directory = cannonical_destination + "/" + found_filename
+        if len(os.listdir(ctat_genome_directory)) == 1:
+            # Then that one file is a subdirectory that should be the ctat_genome_directory.
+            subdir_filename = os.listdir(ctat_genome_directory)[0]
+            ctat_genome_directory += "/" + subdir_filename
+    else:
+        raise ValueError("ERROR: Could not find the extracted file in the destination directory:" + \
+            "\n\t{:s}".format(cannonical_destination))
+
+    # In all downloaded libraries there is additional processing
+    # that needs to happen for gmap-fusion to work.
+ command = "gmap_build -D {:s}/ -d ref_genome.fa.gmap -k 13 {:s}/ref_genome.fa".format( \ + ctat_genome_directory, ctat_genome_directory) + try: # to run the command. + command_output = subprocess.check_output(command, shell=True) + except subprocess.CalledProcessError as e: + print "ERROR: While trying to process the genome library library:\n\t{:s}".format(command) + print "================================================" + print "\n\tOutput while running the command was:\n\n{:s}".format(e.output) + print "================================================" + raise + + # If the src_filename indicates it is a source file, as opposed to plug-n-play, + # then we need to do additional post processing on it with FusionFilter commands. + if src_filename.split(".").contains("source_data"): + # The use of conda to install the FusionFilter should make the following commands + # available without the need to find out where FusionFilter resides. + # ${FusionFilter_HOME}/prep_genome_lib.pl \ + # --genome_fa ref_genome.fa \ + # --gtf ref_annot.gtf \ + # --blast_pairs blast_pairs.gene_syms.outfmt6.gz \ + # --fusion_annot_lib fusion_lib.dat.gz + # ${FusionFilter_HOME}/util/index_pfam_domain_info.pl \ + # --pfam_domains PFAM.domtblout.dat.gz \ + # --genome_lib_dir ctat_genome_lib_build_dir + # + # I don't know if we can run the programs without changing to the directory. + # The instructions in https://github.com/FusionFilter/FusionFilter/wiki + # say to change directory before running the commands. + os.chdir(ctat_genome_directory) + command = "prep_genome_lib.pl " + \ + "--genome_fa ref_genome.fa " + \ + "--gtf ref_annot.gtf " + \ + "--blast_pairs blast_pairs.gene_syms.outfmt6.gz " + \ + "--fusion_annot_lib fusion_lib.dat.gz" + try: # to run the command. + command_output = subprocess.check_output(command, shell=True) + except subprocess.CalledProcessError as e: + print "ERROR: While trying to process the genome library:\n\t{:s}".format(command) + print "================================================" + print "\n\tOutput while running the command was:\n\n{:s}".format(e.output) + print "================================================" + raise + command = "index_pfam_domain_info.pl " + \ + "--pfam_domains PFAM.domtblout.dat.gz " + \ + "--genome_lib_dir \"{:s}\"".format(ctat_genome_directory) + try: # to run the command. 
+            command_output = subprocess.check_output(command, shell=True)
+        except subprocess.CalledProcessError as e:
+            print "ERROR: While trying to process the genome library:\n\t{:s}".format(command)
+            print "================================================"
+            print "\n\tOutput while running the command was:\n\n{:s}".format(e.output)
+            print "================================================"
+            raise
+    # end of post-processing for source_data files
+
+    return ctat_genome_directory
+
+def main():
+    # Parse Command Line
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-d', '--download', default="", \
+        help='Do not use if you already have a CTAT Resource Library that this program downloads.')
+    parser.add_argument('-g', '--genome_name', default="UNSPECIFIED_GenomeName", \
+        help='Is used as the selector text of the entry in the data table.')
+    parser.add_argument('-p', '--destination_path', \
+        help='Full path of the CTAT Resource Library location or destination.')
+    parser.add_argument('-o', '--output_filename', \
+        help='Name of the output file, where the json dictionary will be written.')
+    args = parser.parse_args()
+
+    # All of the input parameters are written by default to the json output file prior to
+    # this program being called.
+    # But I do not get input values from the json file, but rather from the command line.
+    # Just leaving the following code as a comment, in case it might be useful to someone later.
+    # The target_directory is the typical location where data managers put their data, but then
+    # the typical model is to then copy it to the final location. With our files taking up so many
+    # GB of space, we don't want to move them around, but rather have the Galaxy Admin give us
+    # the final location (the destination_path) where the files will be placed (or currently reside).
+    #
+    # params = from_json_string(open(output_filename).read())
+    # target_directory = params['output_data'][0]['extra_files_path']
+    # os.mkdir(target_directory)
+
+    if args.download != "":
+        ctat_genome_resource_lib_path = \
+            download_from_BroadInst(src_filename=args.download, destination=args.destination_path)
+    else:
+        # FIX - probably should check if this is a valid path with an actual CTAT Genome Ref Lib there.
+        ctat_genome_resource_lib_path = args.destination_path
+
+    if (args.genome_name is None) or (args.genome_name == ""):
+        genome_name = "GRCh38_gencode_v26"
+    else:
+        genome_name = args.genome_name
+    # Set the table_entry_value to the basename of the directory path minus the extension.
+    # FIX - Need to make sure it is unique. This is not a good way to do it; just doing it this way for now, for testing.
+    table_entry_value = os.path.basename(ctat_genome_resource_lib_path).split(".")[0]
+    data_manager_dict = {}
+    data_manager_dict['data_tables'] = {}
+    data_manager_dict['data_tables']['ctat_genome_ref_libs'] = []
+    data_table_entry = dict(value=table_entry_value, name=genome_name, path=ctat_genome_resource_lib_path)
+    data_manager_dict['data_tables']['ctat_genome_ref_libs'].append(data_table_entry)
+
+    # Save info to json file. This is used to transfer data from the DataManager tool to the data manager,
+    # which then puts it into the correct .loc file (I think).
+    open(args.output_filename, 'wb').write(to_json_string(data_manager_dict))
+
+if __name__ == "__main__":
+    main()
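For orientation, the JSON that main() writes to --output_filename (and that the Galaxy data manager framework later merges into the ctat_genome_ref_libs data table) has roughly the following shape. The value, name, and path below are made-up illustrations, not output produced by this changeset:

    # Illustrative only: the dictionary structure written by main(); all values are hypothetical.
    example_output = {
        "data_tables": {
            "ctat_genome_ref_libs": [
                {
                    "value": "GRCh38_gencode_v26_CTAT_lib_Nov012017",  # table_entry_value (basename minus extension)
                    "name": "GRCh38_gencode_v26",                      # genome_name, shown as the selector text
                    "path": "/example/path/to/the/ctat/genome/lib",    # destination_path / extracted library directory
                }
            ]
        }
    }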
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/add_ctat_ref_lib.xml	Mon Jan 22 16:06:42 2018 -0500
@@ -0,0 +1,61 @@
+<tool id="ctat_genome_ref_lib_data_manager"
+      name="CTAT Genome Reference Library Data Manager"
+      version="1.0.0" tool_type="manage_data">
+    <description>Retrieve, and/or specify the location of, a CTAT Genome Reference Library.
+    </description>
+    <requirements>
+        <requirement type="package" version="2.7">python</requirement>
+        <requirement type="package" version="0.3.0">gmap-fusion</requirement>
+        <!-- gmap-fusion is required in order to process downloaded libraries
+             to create all of the required files and indexes. It includes gmap
+             and FusionFilter, programs from both of which are needed.
+        -->
+    </requirements>
+    <command detect_errors="default">
+        <![CDATA[
+        python $__tool_directory__/add_ctat_ref_lib.py
+            --genome_name "${genome_name}"
+            --destination_path "${destination}"
+            --output_filename "${out_file}"
+        #if str($download_question.download) == "true":
+            --download "${download_question.filename}"
+        #end if
+        ]]>
+    </command>
+    <inputs>
+        <!--
+        <param name="download" type="boolean" checked="false"
+               truevalue="- -download" falsevalue="" label="Need to Download? (yes/no)" />
+        <param name="download" type="select" label="Need to Download?">
+            <option value="single" selected="true">Single Dataset</option>
+            <option value="paired_collection">Paired Collection</option>
+        <when value="paired_collection">
+            <param name="fastq_input" format="fastqsanger" type="data_collection" collection_type="paired" label="Select dataset pair" help="Specify paired dataset collection containing paired reads"/>
+        </when>
+        -->
+        <conditional name="download_question">
+            <param name="download" type="boolean" checked="false" label="Need to Download? (yes/no)" />
+            <when value="true">
+                <param name="filename" type="select" label="Select File" display="radio"
+                       dynamic_options="get_ctat_genome_filenames()" help="Select a CTAT Genome Reference Library to Download." />
+            </when>
+        </conditional>
+
+        <param name="genome_name" type="text" label="Reference Genome name" />
+        <param name="destination" type="text" label="Local Destination (full path)" />
+    </inputs>
+    <outputs>
+        <data name="out_file" format="data_manager_json" />
+    </outputs>
+    <help>
+        Retrieve, and/or specify the location of, a CTAT Genome Reference Library.
+        When download is true, the file retrieved and processed is https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/GRCh38_gencode_v26_CTAT_lib_Nov012017.plug-n-play.tar.gz.
+        Specify the full path of the location where the CTAT Reference Library should be placed.
+        You will need approximately 30GB of space for this library.
+        If you already have the library, specify the full path of the location where it exists and leave the download box unchecked.
+        The Reference Genome name may be left empty if downloading. The name will be used as the selector text of the entry in the data table.
+        For more information on CTAT Genome Reference Libraries, see the FusionFilter wiki at https://github.com/FusionFilter/FusionFilter/wiki
+    </help>
+    <code file="add_ctat_ref_lib.py" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/get_ctat_genome_filenames.py	Mon Jan 22 16:06:42 2018 -0500
@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+
+# FIX - NEED TO FINISH WRITING THIS.
+# get_ctat_genome_filenames():
+#     open the url and retrieve the filenames of the files in the directory.
+#     return a tuple of the filenames
+
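A possible completion of this stub, sketched (untested) on the assumption that it should mirror the FileListParser-based get_ctat_genome_filenames() already defined in add_ctat_ref_lib.py above (Python 2, urllib2/HTMLParser):

    #!/usr/bin/env python
    # Sketch only - duplicates the helper in add_ctat_ref_lib.py rather than defining anything new.
    import urllib2
    from HTMLParser import HTMLParser

    class FileListParser(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
            self.filenames = set()
        def handle_starttag(self, tag, attrs):
            # Collect hrefs of tar.gz archives in anchor tags, skipping their md5 companion files.
            if tag == "a":
                for name, value in attrs:
                    if name == "href" and "tar.gz" in value and "md5" not in value:
                        self.filenames.add(value)

    def get_ctat_genome_filenames():
        # Read the CTAT resource directory listing and return the archive filenames found in it.
        resource = urllib2.urlopen('https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/')
        filelist_parser = FileListParser()
        filelist_parser.feed(resource.read())
        return tuple(filelist_parser.filenames)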
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/ctat_genome_ref_libs.loc.sample	Mon Jan 22 16:06:42 2018 -0500
@@ -0,0 +1,12 @@
+# This file lists the locations of CTAT Genome Reference Libraries.
+# Usually there will only be one library, but it is conceivable
+# that there could be multiple libraries.
+# The file format is as follows
+# (white space characters are TAB characters):
+#
+#<unique_id>	<display_name>	<file_path>
+#
+# ctat_genome_ref_libs.loc could look like:
+#
+#CTAT_RESOURCE_LIB	GRCh38_gencode_v26	/ctat/genome/resource/lib/path
+#
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Mon Jan 22 16:06:42 2018 -0500
@@ -0,0 +1,6 @@
+<tables>
+    <table name="ctat_genome_ref_libs" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, name, path</columns>
+        <file path="tool-data/ctat_genome_ref_libs.loc" />
+    </table>
+</tables>