#!/usr/bin/env python

# ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/

# Rewritten by H.E. Cicada Brokaw Dennis from a source downloaded from the toolshed and
# other example code on the web.
# This now allows downloading of a user-selected library,
# but only from the CTAT Genome Reference Library website.
# Ultimately we might want to allow the user to specify any location
# from which to download.
# Users can create or download other libraries and use this tool to add them if they don't want
# or don't know how to add them by hand.

import argparse
import os
#import tarfile
#import urllib
import subprocess

from galaxy.util.json import from_json_string, to_json_string

# The following function is used by the Data Manager interface (.xml file) to get the
# filenames that are available online at broadinstitute.org
# Not sure best way to do it. This method parses the html looking for the filenames.
import urllib2
from HTMLParser import HTMLParser

27 class FileListParser(HTMLParser):
|
|
28 def __init__(self):
|
|
29 # Have to use direct call to super class rather than using super():
|
|
30 # super(FileListParser, self).__init__()
|
|
31 # because HTMLParser is an "old style" class and its inheritance chain does not include object.
|
|
32 HTMLParser.__init__(self)
|
|
33 self.filenames = set()
|
|
34 def handle_starttag(self, tag, attrs):
|
|
35 # Look for filename references in anchor tags and add them to filenames.
|
|
36 if tag == "a":
|
|
37 # The tag is an anchor tag.
|
|
38 for attribute in attrs:
|
|
39 # print "Checking: {:s}".format(str(attribute))
|
|
40 if attribute[0] == "href":
|
|
41 # Does the href have a tar.gz in it?
|
|
42 if ("tar.gz" in attribute[1]) and ("md5" not in attribute[1]):
|
|
43 # Add the value to filenames.
|
|
44 self.filenames.add(attribute[1])
|
|
45 # End of class FileListParser
|
|
46
|
|
47 def get_ctat_genome_filenames():
|
|
48 # open the url and retrieve the filenames of the files in the directory.
|
|
49 resource = urllib2.urlopen('https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/')
|
|
50 theHTML = resource.read()
|
|
51 filelist_parser = FileListParser()
|
|
52 filelist_parser.feed(theHTML)
|
|
53 # return a tuple of the filenames
|
|
54 return tuple(filelist_parser.filenames)
|
|
55
|
|
# The following was used by the example program to get input parameters through the json.
# Just leaving it here for reference.
#def get_reference_id_name(params):
#    genome_id = params['param_dict']['genome_id']
#    genome_name = params['param_dict']['genome_name']
#    return genome_id, genome_name
#
#def get_url(params):
#    trained_url = params['param_dict']['trained_url']
#    return trained_url
|
|
67 def download_from_BroadInst(src_filename, destination):
|
|
68 # FIX - The name of this function is too narrow now. It does more than download.
|
|
69 # Perhaps split function into its pieces and rename.
|
|
70 # FIX - need to consider if this is a rerun of a failed processing or download
|
|
71 # If the files that would be downloaded exist and are the correct size, we should
|
|
72 # skip the download, also in post-processing we should see if the data has been
|
|
73 # processed before, and whether the processed files are the correct size?
|
|
74 # or do the functions themselves already check if the files are there and skip steps?
|
|
75 # Maybe add a field for the user to indicate to ignore/delete previous data and
|
|
76 # redownload and reprocess. In Notes to Galaxy Admin recommend that certain memory
|
|
77 # and computing resources are needed to generate the indexes.
|
|
78 ctat_resource_lib = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/' + src_filename
|
|
79 # FIX - Check that the download directory is empty if it exists.
|
|
80 # why does it need to be empty? The downloaded file will be a single directory in that file.
|
|
81 # Also, can we check if there is enough space on the device as well?
|
|
82 # FIX - Also we want to make sure that destination is absolute fully specified path.
|
|
83 cannonical_destination = os.path.realpath(destination)
|
|
84 if os.path.exists(cannonical_destination):
|
|
85 if not os.path.isdir(cannonical_destination):
|
|
86 raise ValueError("The destination is not a directory: {:s}".format(cannonical_destination))
|
|
87 # else all is good. It is a directory.
|
|
88 else:
|
|
89 # We need to create it.
|
|
90 os.makedirs(cannonical_destination)
|
|
91 # Get the list of files in the directory, so after we extract the archive we can find the one
|
|
92 # that was extracted as the file that is not in this list.
|
|
93 orig_files_in_destdir = set(os.listdir(cannonical_destination))
|
|
94 #Download ref: https://dzone.com/articles/how-download-file-python
|
|
95 #f = urllib2.urlopen(ctat_resource_lib)
|
|
96 #data = f.read()
|
|
97 #with open(filepath, 'wb') as code:
|
|
98 # code.write(data)
|
|
99 # another way
|
|
100 #full_filepath = os.path.join(destination, src_filename)
|
|
101 #urllib.urlretrieve(url=ctat_resource_lib, filename=full_filepath)
|
|
102 # Put the following into a try statement, so that if there is a failure
|
|
103 # something can be printed about it before reraising exception.
|
|
104 #tarfile.open(full_filepath, mode='r:*').extractall()
|
|
105 # But we want to transfer and untar it without storing the tar file, because that
|
|
106 # adds all that much more space to the needed amount of free space.
|
|
107 # so use subprocess to pipe the output of curl into tar.
|
|
108 command = "curl {:s} | tar -xzvf - -C {:s}".format(ctat_resource_lib, cannonical_destination)
|
|
109 try: # to run the command that downloads and extracts the file.
|
|
110 command_output = subprocess.check_output(command, shell=True)
|
|
111 except subprocess.CalledProcessError as e:
|
|
112 print "ERROR: Trying to run the following command:\n\t{:s}".format(command)
|
|
113 print "================================================"
|
|
114 print "\tOutput while running the command was:\n\n{:s}".format(e.output)
|
|
115 print "================================================"
|
|
116 raise
|
|
117 # Get the root filename of the extracted file. It will be the file that was not in the directory
|
|
118 # before we did the download and extraction.
|
|
119 newfiles_in_destdir = set(os.listdir(cannonical_destination)) - orig_files_in_destdir
|
|
120 found_filename = None
|
|
121 for filename in newfiles_in_destdir:
|
|
122 # If things are right there should just be one new file, the directory that was extracted.
|
|
123 # But in case there was something that happened on the system that created other files,
|
|
124 # the correct file's name should be a substring of the tar file that was downloaded.
|
|
125 if filename in src_filename:
|
|
126 found_filename = filename
|
|
127 if found_filename is not None:
|
|
128 ctat_genome_directory = cannonical_destination + "/" + found_filename
|
|
129 if len(os.listdir(ctat_genome_directory)) == 1:
|
|
130 # Then that one file is a subdirectory that should be the ctat_genome_directory.
|
|
131 subdir_filename = os.listdir(ctat_genome_directory)[0]
|
|
132 ctat_genome_directory += "/" + subdir_filename
|
|
133 else:
|
|
134 raise ValueError("ERROR: Could not find the extracted file in the destination directory:" + \
|
|
135 "\n\t{:s}".format(cannonical_destination))
|
|
136
|
|
137 # In all downloaded libraries there is additional processing
|
|
138 # that needs to happen for gmap-fusion to work.
|
|
139 command = "gmap_build -D {:s}/ -d ref_genome.fa.gmap -k 13 {:s}/ref_genome.fa".format( \
|
|
140 ctat_genome_directory, ctat_genome_directory)
|
|
141 try: # to run the command.
|
|
142 command_output = subprocess.check_output(command, shell=True)
|
|
143 except subprocess.CalledProcessError as e:
|
|
144 print "ERROR: While trying to process the genome library library:\n\t{:s}".format(command)
|
|
145 print "================================================"
|
|
146 print "\n\tOutput while running the command was:\n\n{:s}".format(e.output)
|
|
147 print "================================================"
|
|
148 raise
|
|
149
|
|
150 # If the src_filename indicates it is a source file, as opposed to plug-n-play,
|
|
151 # then we need to do additional post processing on it with FusionFilter commands.
|
|
152 if src_filename.split(".").contains("source_data"):
|
|
153 # The use of conda to install the FusionFilter should make the following commands
|
|
154 # available without the need to find out where FusionFilter resides.
|
|
155 # ${FusionFilter_HOME}/prep_genome_lib.pl \
|
|
156 # --genome_fa ref_genome.fa \
|
|
157 # --gtf ref_annot.gtf \
|
|
158 # --blast_pairs blast_pairs.gene_syms.outfmt6.gz \
|
|
159 # --fusion_annot_lib fusion_lib.dat.gz
|
|
160 # ${FusionFilter_HOME}/util/index_pfam_domain_info.pl \
|
|
161 # --pfam_domains PFAM.domtblout.dat.gz \
|
|
162 # --genome_lib_dir ctat_genome_lib_build_dir
|
|
163 #
|
|
164 # I don't know if we can run the programs without changing to the directory.
|
|
165 # The instructions in https://github.com/FusionFilter/FusionFilter/wiki
|
|
166 # say to change directory before running the commands.
|
|
167 os.chdir(ctat_genome_directory)
|
|
168 command = "prep_genome_lib.pl " + \
|
|
169 "--genome_fa ref_genome.fa " + \
|
|
170 "--gtf ref_annot.gtf " + \
|
|
171 "--blast_pairs blast_pairs.gene_syms.outfmt6.gz " + \
|
|
172 "--fusion_annot_lib fusion_lib.dat.gz"
|
|
173 try: # to run the command.
|
|
174 command_output = subprocess.check_output(command, shell=True)
|
|
175 except subprocess.CalledProcessError as e:
|
|
176 print "ERROR: While trying to process the genome library:\n\t{:s}".format(command)
|
|
177 print "================================================"
|
|
178 print "\n\tOutput while running the command was:\n\n{:s}".format(e.output)
|
|
179 print "================================================"
|
|
180 raise
|
|
181 command = "index_pfam_domain_info.pl " + \
|
|
182 "--pfam_domains PFAM.domtblout.dat.gz " + \
|
|
183 "--genome_lib_dir \"{:s}\"".format(ctat_genome_directory)
|
|
184 try: # to run the command.
|
|
185 command_output = subprocess.check_output(command, shell=True)
|
|
186 except subprocess.CalledProcessError as e:
|
|
187 print "ERROR: While trying to process the genome library:\n\t{:s}".format(command)
|
|
188 print "================================================"
|
|
189 print "\n\tOutput while running the command was:\n\n{:s}".format(e.output)
|
|
190 print "================================================"
|
|
191 raise
|
|
192 # end of post-processing for source_data files
|
|
193
|
|
194 return ctat_genome_directory
|
|
195
|
|
196 def main():
|
|
197 #Parse Command Line
|
|
198 parser = argparse.ArgumentParser()
|
|
199 parser.add_argument('-d', '--download', default="", \
|
|
200 help='Do not use if you already have a CTAT Resource Library that this program downloads.')
|
|
201 parser.add_argument('-g', '--genome_name', default="UNSPECIFIED_GenomeName", \
|
|
202 help='Is used as the selector text of the entry in the data table.')
|
|
203 parser.add_argument('-p', '--destination_path', \
|
|
204 help='Full path of the CTAT Resource Library location or destination.')
|
|
205 parser.add_argument('-o', '--output_filename', \
|
|
206 help='Name of the output file, where the json dictionary will be written.')
|
|
207 args = parser.parse_args()
|
|
208
|
|
209 # All of the input parameters are written by default to the json output file prior to
|
|
210 # this program being called.
|
|
211 # But I do not get input values from the json file, but rather from command line.
|
|
212 # Just leaving the following code as a comment, in case it might be useful to someone later.
|
|
213 # The target_directory is the typical location where data managers put their data, but then
|
|
214 # the typical model is to then copy it to the final location. With our files taking up so many
|
|
215 # GB of space, we don't want to move them around, but rather have the Galaxy Admin give us
|
|
216 # the final location (the destination_path) where the files will be placed (or currently reside).
|
|
217 #
|
|
218 # params = from_json_string(open(output_filename).read())
|
|
219 # target_directory = params['output_data'][0]['extra_files_path']
|
|
220 # os.mkdir(target_directory)
|
|
221
|
|
222 if args.download != "":
|
|
223 ctat_genome_resource_lib_path = \
|
|
224 download_from_BroadInst(src_filename=args.download, destination=args.destination_path)
|
|
225 else:
|
|
226 # FIX - probably should check if this is a valid path with an actual CTAT Genome Ref Lib there.
|
|
227 ctat_genome_resource_lib_path = args.destination_path
|
|
228
|
|
229 if (args.genome_name is None) or (args.genome_name == ""):
|
|
230 genome_name = "GRCh38_gencode_v26"
|
|
231 else:
|
|
232 genome_name = args.genome_name
|
|
233 # Set the table_entry_value to the basename of the directory path minus the extension.
|
|
234 # FIX - Need to make sure is unique. This is not good way to do it. Just doing it this way now for testing.
|
|
235 table_entry_value = os.path.basename(ctat_genome_resource_lib_path).split(".")[0]
|
|
236 data_manager_dict = {}
|
|
237 data_manager_dict['data_tables'] = {}
|
|
238 data_manager_dict['data_tables']['ctat_genome_ref_libs'] = []
|
|
239 data_table_entry = dict(value=table_entry_value, name=genome_name, path=ctat_genome_resource_lib_path)
|
|
240 data_manager_dict['data_tables']['ctat_genome_ref_libs'].append(data_table_entry)
|
|
241
|
|
242 # Save info to json file. This is used to transfer data from the DataManager tool, to the data manager,
|
|
243 # which then puts it into the correct .loc file (I think).
|
|
244 open(args.output_filename, 'wb').write(to_json_string(data_manager_dict))
|
|
245
|
|
246 if __name__ == "__main__":
|
|
247 main()
|
|
248
|