Mercurial > repos > trinity_ctat > not_used
comparison data_manager/add_ctat_ref_lib.py @ 0:fcf82328f066 draft default tip
new version. Not tested yet.
author | trinity_ctat |
---|---|
date | Mon, 22 Jan 2018 16:06:42 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:fcf82328f066 |
---|---|
1 #!/usr/bin/env python | |
2 # ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/ | |
3 | |
4 # Rewritten by H.E. Cicada Brokaw Dennis from a source downloaded from the toolshed and | |
5 # other example code on the web. | |
6 # This now allows downloading of a user selected library | |
7 # but only from the CTAT Genome Reference Library website. | |
8 # Ultimately we might want to allow the user to specify any location | |
9 # from which to download. | |
10 # Users can create or download other libraries and use this tool to add them if they don't want | |
11 # or don't know how to add them by hand. | |
12 | |
13 import argparse | |
14 import os | |
15 #import tarfile | |
16 #import urllib | |
17 import subprocess | |
18 | |
19 from galaxy.util.json import from_json_string, to_json_string | |
20 | |
21 # The following function is used by the Data Manager interface (.xml file) to get the | |
22 # filenames that are available online at broadinstitute.org | |
23 # Not sure of the best way to do it. This method parses the HTML looking for the filenames. | |
24 import urllib2 | |
25 from HTMLParser import HTMLParser | |
26 | |
class FileListParser(HTMLParser):
    """Collect the hrefs of tar.gz archives referenced by anchor tags.

    Feed HTML to an instance with ``feed()``; afterwards ``filenames`` is the
    set of ``href`` values that contain "tar.gz" and do not contain "md5"
    (so the accompanying checksum files are skipped).
    """

    def __init__(self):
        # Call the base class directly rather than through super():
        #     super(FileListParser, self).__init__()
        # because in Python 2 HTMLParser is an "old style" class and its
        # inheritance chain does not include object.
        HTMLParser.__init__(self)
        self.filenames = set()

    def handle_starttag(self, tag, attrs):
        """Add the href of an anchor tag to ``filenames`` if it is a tar.gz.

        ``attrs`` is the list of (name, value) pairs supplied by HTMLParser.
        """
        if tag != "a":
            # Only anchor tags can reference a downloadable file.
            return
        for name, value in attrs:
            # An attribute written without a value (e.g. <a href>) is
            # reported with a value of None, which cannot be searched.
            if name != "href" or value is None:
                continue
            # Does the href have a tar.gz in it (and is it not a checksum)?
            if ("tar.gz" in value) and ("md5" not in value):
                self.filenames.add(value)
# End of class FileListParser
46 | |
def get_ctat_genome_filenames():
    """Return the archive names listed on the CTAT resource library page.

    Opens https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/ and
    parses the returned HTML with FileListParser, which keeps the href of
    every anchor that references a tar.gz file (md5 files excluded).

    Returns:
        A tuple of the href strings found, sorted so that the order is the
        same on every call (the parser collects them in an unordered set).
    """
    # open the url and retrieve the filenames of the files in the directory.
    resource = urllib2.urlopen('https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/')
    try:
        the_html = resource.read()
    finally:
        # Release the connection even when reading the page fails.
        resource.close()
    filelist_parser = FileListParser()
    filelist_parser.feed(the_html)
    # return a tuple of the filenames
    return tuple(sorted(filelist_parser.filenames))
55 | |
56 # The following was used by the example program to get input parameters through the json. | |
57 # Just leaving here for reference. | |
58 #def get_reference_id_name(params): | |
59 # genome_id = params['param_dict']['genome_id'] | |
60 # genome_name = params['param_dict']['genome_name'] | |
61 # return genome_id, genome_name | |
62 # | |
63 #def get_url(params): | |
64 # trained_url = params['param_dict']['trained_url'] | |
65 # return trained_url | |
66 | |
def _run_shell_command(command, error_header, cwd=None):
    """Run ``command`` through the shell; on failure print a report and re-raise.

    ``error_header`` is the first line of the report. ``cwd``, when given, is
    the working directory for the command only; the working directory of this
    process is left untouched.
    """
    try:
        return subprocess.check_output(command, shell=True, cwd=cwd)
    except subprocess.CalledProcessError as e:
        print("{:s}\n\t{:s}".format(error_header, command))
        print("================================================")
        print("\n\tOutput while running the command was:\n\n{}".format(e.output))
        print("================================================")
        raise


def download_from_BroadInst(src_filename, destination):
    """Download, unpack and index a CTAT Genome Resource Library.

    The archive ``src_filename`` is fetched from
    https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/ and extracted
    into ``destination`` (created if it does not exist).  ``gmap_build`` is
    then run on the extracted library, and when ``src_filename`` has a
    "source_data" component the FusionFilter scripts ``prep_genome_lib.pl``
    and ``index_pfam_domain_info.pl`` are run as well.

    Args:
        src_filename: Name of the tar.gz archive on the CTAT website.
        destination: Directory into which the archive is extracted.

    Returns:
        The path of the extracted genome library directory.

    Raises:
        ValueError: If ``destination`` exists but is not a directory, or if
            no newly extracted entry matching ``src_filename`` is found.
        subprocess.CalledProcessError: If any of the external commands fails
            (a report is printed before the exception is re-raised).
    """
    # Local import so the block does not depend on a file-level import.
    try:
        from shlex import quote  # Python 3
    except ImportError:
        from pipes import quote  # Python 2

    # TODO - The name of this function is too narrow now. It does more than
    #     download. Perhaps split the function into its pieces and rename.
    # TODO - Consider reruns of a failed download or processing: skip work
    #     whose results already exist, or let the user request that previous
    #     data be deleted and redone. In Notes to Galaxy Admin recommend that
    #     certain memory and computing resources are needed to generate the
    #     indexes.
    # TODO - Can we check that there is enough space on the device?
    ctat_resource_lib = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/' + src_filename
    cannonical_destination = os.path.realpath(destination)
    if os.path.exists(cannonical_destination):
        if not os.path.isdir(cannonical_destination):
            raise ValueError("The destination is not a directory: {:s}".format(cannonical_destination))
        # else all is good. It is a directory.
    else:
        # We need to create it.
        os.makedirs(cannonical_destination)
    # Get the list of files in the directory, so after we extract the archive
    # we can find the one that was extracted as the file not in this list.
    orig_files_in_destdir = set(os.listdir(cannonical_destination))

    # Transfer and untar without storing the tar file, because keeping it
    # would add that much more to the needed amount of free space: pipe the
    # output of curl into tar.  Arguments are shell-quoted so that paths
    # containing spaces or shell metacharacters are passed through intact.
    command = "curl {:s} | tar -xzvf - -C {:s}".format(
        quote(ctat_resource_lib), quote(cannonical_destination))
    _run_shell_command(command, "ERROR: Trying to run the following command:")

    # Get the root filename of the extracted file. It will be the file that
    # was not in the directory before we did the download and extraction.
    newfiles_in_destdir = set(os.listdir(cannonical_destination)) - orig_files_in_destdir
    found_filename = None
    for filename in newfiles_in_destdir:
        # If things are right there should just be one new file, the directory
        # that was extracted. But in case something else on the system created
        # other files, the correct file's name should be a substring of the
        # name of the tar file that was downloaded.
        if filename in src_filename:
            found_filename = filename
    if found_filename is None:
        raise ValueError("ERROR: Could not find the extracted file in the destination directory:" +
                         "\n\t{:s}".format(cannonical_destination))
    ctat_genome_directory = os.path.join(cannonical_destination, found_filename)
    extracted_entries = os.listdir(ctat_genome_directory)
    if len(extracted_entries) == 1:
        # Then that one entry is a subdirectory that should be the
        # ctat_genome_directory; only descend when it really is a directory.
        candidate = os.path.join(ctat_genome_directory, extracted_entries[0])
        if os.path.isdir(candidate):
            ctat_genome_directory = candidate

    # In all downloaded libraries there is additional processing
    # that needs to happen for gmap-fusion to work.
    command = "gmap_build -D {:s} -d ref_genome.fa.gmap -k 13 {:s}".format(
        quote(ctat_genome_directory + "/"),
        quote(os.path.join(ctat_genome_directory, "ref_genome.fa")))
    processing_error = "ERROR: While trying to process the genome library:"
    _run_shell_command(command, processing_error)

    # If the src_filename indicates it is a source file, as opposed to
    # plug-n-play, then we need to do additional post processing on it with
    # FusionFilter commands.
    if "source_data" in src_filename.split("."):
        # The use of conda to install the FusionFilter should make the
        # following commands available without the need to find out where
        # FusionFilter resides.
        # The instructions in https://github.com/FusionFilter/FusionFilter/wiki
        # say to change directory before running the commands, so they are run
        # with the library directory as their working directory (cwd=) rather
        # than changing the working directory of this whole process.
        command = ("prep_genome_lib.pl "
                   "--genome_fa ref_genome.fa "
                   "--gtf ref_annot.gtf "
                   "--blast_pairs blast_pairs.gene_syms.outfmt6.gz "
                   "--fusion_annot_lib fusion_lib.dat.gz")
        _run_shell_command(command, processing_error, cwd=ctat_genome_directory)

        command = ("index_pfam_domain_info.pl "
                   "--pfam_domains PFAM.domtblout.dat.gz "
                   "--genome_lib_dir ") + quote(ctat_genome_directory)
        _run_shell_command(command, processing_error, cwd=ctat_genome_directory)
    # end of post-processing for source_data files

    return ctat_genome_directory
195 | |
def main():
    """Add a CTAT Genome Resource Library entry to the Galaxy data table.

    Command line options:
        -d/--download          Name of the archive to download; when empty
                               the library is expected to already be at the
                               destination path.
        -g/--genome_name       Selector text of the entry in the data table.
        -p/--destination_path  Location (or download destination) of the
                               library. Required.
        -o/--output_filename   File to which the json dictionary describing
                               the new data table entry is written. Required.
    """
    # Parse Command Line
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--download', default="",
        help='Do not use if you already have a CTAT Resource Library that this program downloads.')
    parser.add_argument('-g', '--genome_name', default="UNSPECIFIED_GenomeName",
        help='Is used as the selector text of the entry in the data table.')
    # Both paths are used unconditionally below, so let argparse report a
    # missing one instead of failing later on a None value.
    parser.add_argument('-p', '--destination_path', required=True,
        help='Full path of the CTAT Resource Library location or destination.')
    parser.add_argument('-o', '--output_filename', required=True,
        help='Name of the output file, where the json dictionary will be written.')
    args = parser.parse_args()

    # All of the input parameters are written by default to the json output
    # file prior to this program being called, but the input values are taken
    # from the command line instead.
    # The typical data manager model is to put data in a target directory
    # (params['output_data'][0]['extra_files_path'] from the json file) and
    # then copy it to the final location. With our files taking up so many GB
    # of space, we don't want to move them around, but rather have the Galaxy
    # Admin give us the final location (the destination_path) where the files
    # will be placed (or currently reside).

    if args.download:
        ctat_genome_resource_lib_path = download_from_BroadInst(
            src_filename=args.download, destination=args.destination_path)
    else:
        # FIX - probably should check if this is a valid path with an actual
        # CTAT Genome Ref Lib there.
        ctat_genome_resource_lib_path = args.destination_path

    # Fall back to a fixed name when no genome name (or an empty one) is given.
    genome_name = args.genome_name or "GRCh38_gencode_v26"

    # Set the table_entry_value to the basename of the directory path minus
    # the extension. The path is normalized first so that a trailing slash
    # does not produce an empty basename.
    # FIX - Need to make sure is unique. This is not good way to do it.
    # Just doing it this way now for testing.
    table_entry_value = os.path.basename(
        os.path.normpath(ctat_genome_resource_lib_path)).split(".")[0]
    data_table_entry = dict(value=table_entry_value, name=genome_name,
                            path=ctat_genome_resource_lib_path)
    data_manager_dict = {'data_tables': {'ctat_genome_ref_libs': [data_table_entry]}}

    # Save info to json file. This is used to transfer data from the
    # DataManager tool, to the data manager, which then puts it into the
    # correct .loc file (I think).
    # The json is text, so the file is opened in text mode and closed
    # deterministically by the with statement.
    with open(args.output_filename, 'w') as output_file:
        output_file.write(to_json_string(data_manager_dict))


if __name__ == "__main__":
    main()
248 |