#!/usr/bin/env python

# ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/

# Rewritten by H.E. Cicada Brokaw Dennis from a source downloaded from the toolshed and
# other example code on the web.
# This now allows downloading of a user-selected library,
# but only from the CTAT Genome Reference Library website.
# Ultimately we might want to allow the user to specify any location
# from which to download.
# Users can create or download other libraries and use this tool to add them if they don't want
# or don't know how to add them by hand.

import argparse
import os
#import tarfile
#import urllib
import subprocess

from galaxy.util.json import from_json_string, to_json_string

# The following function is used by the Data Manager interface (.xml file) to get the
# filenames that are available online at broadinstitute.org
# Not sure best way to do it. This method parses the html looking for the filenames.
import urllib2
from HTMLParser import HTMLParser

27 class FileListParser(HTMLParser):
|
|
28 def __init__(self):
|
|
29 # Have to use direct call to super class rather than using super():
|
|
30 # super(FileListParser, self).__init__()
|
|
31 # because HTMLParser is an "old style" class and its inheritance chain does not include object.
|
|
32 HTMLParser.__init__(self)
|
|
33 self.filenames = set()
|
|
34 def handle_starttag(self, tag, attrs):
|
|
35 # Look for filename references in anchor tags and add them to filenames.
|
|
36 if tag == "a":
|
|
37 # The tag is an anchor tag.
|
|
38 for attribute in attrs:
|
|
39 # print "Checking: {:s}".format(str(attribute))
|
|
40 if attribute[0] == "href":
|
|
41 # Does the href have a tar.gz in it?
|
|
42 if ("tar.gz" in attribute[1]) and ("md5" not in attribute[1]):
|
|
43 # Add the value to filenames.
|
|
44 self.filenames.add(attribute[1])
|
|
45 # End of class FileListParser
|
|
46
|
|
47 def get_ctat_genome_filenames():
|
|
48 # open the url and retrieve the filenames of the files in the directory.
|
|
49 resource = urllib2.urlopen('https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/')
|
|
50 theHTML = resource.read()
|
|
51 filelist_parser = FileListParser()
|
|
52 filelist_parser.feed(theHTML)
|
|
53 # return a tuple of the filenames
|
|
54 return tuple(filelist_parser.filenames)
|
|
55
|
|
# The following was used by the example program to get input parameters through the json.
# Just leaving it here for reference.
#def get_reference_id_name(params):
#    genome_id = params['param_dict']['genome_id']
#    genome_name = params['param_dict']['genome_name']
#    return genome_id, genome_name
#
#def get_url(params):
#    trained_url = params['param_dict']['trained_url']
#    return trained_url
|
|
67 def download_from_BroadInst(src_filename, destination):
|
|
68 # FIX - The name of this function is too narrow now. It does more than download.
|
|
69 # Perhaps split function into its pieces and rename.
|
|
70 # FIX - need to consider if this is a rerun of a failed processing or download
|
|
71 # If the files that would be downloaded exist and are the correct size, we should
|
|
72 # skip the download, also in post-processing we should see if the data has been
|
|
73 # processed before, and whether the processed files are the correct size?
|
|
74 # or do the functions themselves already check if the files are there and skip steps?
|
|
75 # Maybe add a field for the user to indicate to ignore/delete previous data and
|
|
76 # redownload and reprocess. In Notes to Galaxy Admin recommend that certain memory
|
|
77 # and computing resources are needed to generate the indexes.
|
|
78 ctat_resource_lib = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/' + src_filename
|
|
79 # FIX - Check that the download directory is empty if it exists.
|
|
80 # why does it need to be empty? The downloaded file will be a single directory in that file.
|
|
81 # Also, can we check if there is enough space on the device as well?
|
|
82 # FIX - Also we want to make sure that destination is absolute fully specified path.
|
|
83 cannonical_destination = os.path.realpath(destination)
|
|
84 if os.path.exists(cannonical_destination):
|
|
85 if not os.path.isdir(cannonical_destination):
|
|
86 raise ValueError("The destination is not a directory: {:s}".format(cannonical_destination))
|
|
87 # else all is good. It is a directory.
|
|
88 else:
|
|
89 # We need to create it.
|
|
90 os.makedirs(cannonical_destination)
|
|
91 # Get the list of files in the directory, so after we extract the archive we can find the one
|
|
92 # that was extracted as the file that is not in this list.
|
|
93 orig_files_in_destdir = set(os.listdir(cannonical_destination))
|
|
94 #Download ref: https://dzone.com/articles/how-download-file-python
|
|
95 #f = urllib2.urlopen(ctat_resource_lib)
|
|
96 #data = f.read()
|
|
97 #with open(filepath, 'wb') as code:
|
|
98 # code.write(data)
|
|
99 # another way
|
|
100 #full_filepath = os.path.join(destination, src_filename)
|
|
101 #urllib.urlretrieve(url=ctat_resource_lib, filename=full_filepath)
|
|
102 # Put the following into a try statement, so that if there is a failure
|
|
103 # something can be printed about it before reraising exception.
|
|
104 #tarfile.open(full_filepath, mode='r:*').extractall()
|
|
105 # But we want to transfer and untar it without storing the tar file, because that
|
|
106 # adds all that much more space to the needed amount of free space.
|
|
107 # so use subprocess to pipe the output of curl into tar.
|
|
108 command = "curl {:s} | tar -xzvf - -C {:s}".format(ctat_resource_lib, cannonical_destination)
|
|
109 try: # to run the command that downloads and extracts the file.
|
|
110 command_output = subprocess.check_output(command, shell=True)
|
|
111 except subprocess.CalledProcessError as e:
|
|
112 print "ERROR: Trying to run the following command:\n\t{:s}".format(command)
|
|
113 print "================================================"
|
|
114 print "\tOutput while running the command was:\n\n{:s}".format(e.output)
|
|
115 print "================================================"
|
|
116 raise
|
|
117 # Get the root filename of the extracted file. It will be the file that was not in the directory
|
|
118 # before we did the download and extraction.
|
|
119 newfiles_in_destdir = set(os.listdir(cannonical_destination)) - orig_files_in_destdir
|
|
120 found_filename = None
|
|
121 for filename in newfiles_in_destdir:
|
|
122 # If things are right there should just be one new file, the directory that was extracted.
|
|
123 # But in case there was something that happened on the system that created other files,
|
|
124 # the correct file's name should be a substring of the tar file that was downloaded.
|
|
125 if filename in src_filename:
|
|
126 found_filename = filename
|
|
127 if found_filename is not None:
|
|
128 ctat_genome_directory = cannonical_destination + "/" + found_filename
|
|
129 if len(os.listdir(ctat_genome_directory)) == 1:
|
|
130 # Then that one file is a subdirectory that should be the ctat_genome_directory.
|
|
131 subdir_filename = os.listdir(ctat_genome_directory)[0]
|
|
132 ctat_genome_directory += "/" + subdir_filename
|
|
133 else:
|
|
134 raise ValueError("ERROR: Could not find the extracted file in the destination directory:" + \
|
|
135 "\n\t{:s}".format(cannonical_destination))
|
|
136
|
|
137 # In all downloaded libraries there is additional processing
|
|
138 # that needs to happen for gmap-fusion to work.
|
|
139 command = "gmap_build -D {:s}/ -d ref_genome.fa.gmap -k 13 {:s}/ref_genome.fa".format( \
|
|
140 ctat_genome_directory, ctat_genome_directory)
|
|
141 try: # to run the command.
|
|
142 command_output = subprocess.check_output(command, shell=True)
|
|
143 except subprocess.CalledProcessError as e:
|
|
144 print "ERROR: While trying to process the genome library library:\n\t{:s}".format(command)
|
|
145 print "================================================"
|
|
146 print "\n\tOutput while running the command was:\n\n{:s}".format(e.output)
|
|
147 print "================================================"
|
|
148 raise
|
|
149
|
|
150 # If the src_filename indicates it is a source file, as opposed to plug-n-play,
|
|
151 # then we need to do additional post processing on it with FusionFilter commands.
|
|
152 if src_filename.split(".").contains("source_data"):
|
|
153 # The use of conda to install the FusionFilter should make the following commands
|
|
154 # available without the need to find out where FusionFilter resides.
|
|
155 # ${FusionFilter_HOME}/prep_genome_lib.pl \
|
|
156 # --genome_fa ref_genome.fa \
|
|
157 # --gtf ref_annot.gtf \
|
|
158 # --blast_pairs blast_pairs.gene_syms.outfmt6.gz \
|
|
159 # --fusion_annot_lib fusion_lib.dat.gz
|
|
160 # ${FusionFilter_HOME}/util/index_pfam_domain_info.pl \
|
|
161 # --pfam_domains PFAM.domtblout.dat.gz \
|
|
162 # --genome_lib_dir ctat_genome_lib_build_dir
|
|
163 #
|
|
164 # I don't know if we can run the programs without changing to the directory.
|
|
165 # The instructions in https://github.com/FusionFilter/FusionFilter/wiki
|
|
166 # say to change directory before running the commands.
|
|
167 os.chdir(ctat_genome_directory)
|
|
168 command = "prep_genome_lib.pl " + \
|
|
169 "--genome_fa ref_genome.fa " + \
|
|
170 "--gtf ref_annot.gtf " + \
|
|
171 "--blast_pairs blast_pairs.gene_syms.outfmt6.gz " + \
|
|
172 "--fusion_annot_lib fusion_lib.dat.gz"
|
|
173 try: # to run the command.
|
|
174 command_output = subprocess.check_output(command, shell=True)
|
|
175 except subprocess.CalledProcessError as e:
|
|
176 print "ERROR: While trying to process the genome library:\n\t{:s}".format(command)
|
|
177 print "================================================"
|
|
178 print "\n\tOutput while running the command was:\n\n{:s}".format(e.output)
|
|
179 print "================================================"
|
|
180 raise
|
|
181 command = "index_pfam_domain_info.pl " + \
|
|
182 "--pfam_domains PFAM.domtblout.dat.gz " + \
|
|
183 "--genome_lib_dir \"{:s}\"".format(ctat_genome_directory)
|
|
184 try: # to run the command.
|
|
185 command_output = subprocess.check_output(command, shell=True)
|
|
186 except subprocess.CalledProcessError as e:
|
|
187 print "ERROR: While trying to process the genome library:\n\t{:s}".format(command)
|
|
188 print "================================================"
|
|
189 print "\n\tOutput while running the command was:\n\n{:s}".format(e.output)
|
|
190 print "================================================"
|
|
191 raise
|
|
192 # end of post-processing for source_data files
|
|
193
|
|
194 return ctat_genome_directory
|
|
195
|
|
196 def main():
|
|
197 #Parse Command Line
|
|
198 parser = argparse.ArgumentParser()
|
|
199 parser.add_argument('-d', '--download', default="", \
|
|
200 help='Do not use if you already have a CTAT Resource Library that this program downloads.')
|
|
201 parser.add_argument('-g', '--genome_name', default="UNSPECIFIED_GenomeName", \
|
|
202 help='Is used as the selector text of the entry in the data table.')
|
|
203 parser.add_argument('-p', '--destination_path', \
|
|
204 help='Full path of the CTAT Resource Library location or destination.')
|
|
205 parser.add_argument('-o', '--output_filename', \
|
|
206 help='Name of the output file, where the json dictionary will be written.')
|
|
207 args = parser.parse_args()
|
|
208
|
|
209 # All of the input parameters are written by default to the json output file prior to
|
|
210 # this program being called.
|
|
211 # But I do not get input values from the json file, but rather from command line.
|
|
212 # Just leaving the following code as a comment, in case it might be useful to someone later.
|
|
213 # The target_directory is the typical location where data managers put their data, but then
|
|
214 # the typical model is to then copy it to the final location. With our files taking up so many
|
|
215 # GB of space, we don't want to move them around, but rather have the Galaxy Admin give us
|
|
216 # the final location (the destination_path) where the files will be placed (or currently reside).
|
|
217 #
|
|
218 # params = from_json_string(open(output_filename).read())
|
|
219 # target_directory = params['output_data'][0]['extra_files_path']
|
|
220 # os.mkdir(target_directory)
|
|
221
|
|
222 if args.download != "":
|
|
223 ctat_genome_resource_lib_path = \
|
|
224 download_from_BroadInst(src_filename=args.download, destination=args.destination_path)
|
|
225 else:
|
|
226 # FIX - probably should check if this is a valid path with an actual CTAT Genome Ref Lib there.
|
|
227 ctat_genome_resource_lib_path = args.destination_path
|
|
228
|
|
229 if (args.genome_name is None) or (args.genome_name == ""):
|
|
230 genome_name = "GRCh38_gencode_v26"
|
|
231 else:
|
|
232 genome_name = args.genome_name
|
|
233 # Set the table_entry_value to the basename of the directory path minus the extension.
|
|
234 # FIX - Need to make sure is unique. This is not good way to do it. Just doing it this way now for testing.
|
|
235 table_entry_value = os.path.basename(ctat_genome_resource_lib_path).split(".")[0]
|
|
236 data_manager_dict = {}
|
|
237 data_manager_dict['data_tables'] = {}
|
|
238 data_manager_dict['data_tables']['ctat_genome_ref_libs'] = []
|
|
239 data_table_entry = dict(value=table_entry_value, name=genome_name, path=ctat_genome_resource_lib_path)
|
|
240 data_manager_dict['data_tables']['ctat_genome_ref_libs'].append(data_table_entry)
|
|
241
|
|
242 # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager,
|
|
243 # which then puts it into the correct .loc file (I think).
|
|
244 open(args.output_filename, 'wb').write(to_json_string(data_manager_dict))
|
|
245
|
|
246 if __name__ == "__main__":
|
|
247 main()
|
|
248
|