data_manager/add_ctat_ref_lib.py @ 0:fcf82328f066 (draft, default, tip)
new version. Not tested yet.
| author | trinity_ctat |
|---|---|
| date | Mon, 22 Jan 2018 16:06:42 -0500 |
| parents | |
| children | |
```python
#!/usr/bin/env python
# ref: https://galaxyproject.org/admin/tools/data-managers/how-to/define/

# Rewritten by H.E. Cicada Brokaw Dennis from a source downloaded from the toolshed and
# other example code on the web.
# This now allows downloading of a user selected library
# but only from the CTAT Genome Reference Library website.
# Ultimately we might want to allow the user to specify any location
# from which to download.
# Users can create or download other libraries and use this tool to add them if they don't want
# or don't know how to add them by hand.

import argparse
import os
#import tarfile
#import urllib
import subprocess

from galaxy.util.json import from_json_string, to_json_string

# The following function is used by the Data Manager interface (.xml file) to get the
# filenames that are available online at broadinstitute.org
# Not sure best way to do it. This method parses the html looking for the filenames.
import urllib2
from HTMLParser import HTMLParser

class FileListParser(HTMLParser):
    def __init__(self):
        # Have to use direct call to super class rather than using super():
        #     super(FileListParser, self).__init__()
        # because HTMLParser is an "old style" class and its inheritance chain does not include object.
        HTMLParser.__init__(self)
        self.filenames = set()
    def handle_starttag(self, tag, attrs):
        # Look for filename references in anchor tags and add them to filenames.
        if tag == "a":
            # The tag is an anchor tag.
            for attribute in attrs:
                # print "Checking: {:s}".format(str(attribute))
                if attribute[0] == "href":
                    # Does the href have a tar.gz in it?
                    if ("tar.gz" in attribute[1]) and ("md5" not in attribute[1]):
                        # Add the value to filenames.
                        self.filenames.add(attribute[1])
# End of class FileListParser

def get_ctat_genome_filenames():
    # open the url and retrieve the filenames of the files in the directory.
    resource = urllib2.urlopen('https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/')
    theHTML = resource.read()
    filelist_parser = FileListParser()
    filelist_parser.feed(theHTML)
    # return a tuple of the filenames
    return tuple(filelist_parser.filenames)
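
# Minimal usage sketch of the file-listing helpers above (the filename is illustrative
# only, not a real entry from the CTAT site); the Data Manager .xml is expected to call
# get_ctat_genome_filenames() the same way, just against the live directory listing:
#     parser = FileListParser()
#     parser.feed('<a href="GRCh38_example_CTAT_lib.plug-n-play.tar.gz">lib</a>')
#     print parser.filenames
#     # set(['GRCh38_example_CTAT_lib.plug-n-play.tar.gz'])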

# The following was used by the example program to get input parameters through the json.
# Just leaving here for reference.
#def get_reference_id_name(params):
#    genome_id = params['param_dict']['genome_id']
#    genome_name = params['param_dict']['genome_name']
#    return genome_id, genome_name
#
#def get_url(params):
#    trained_url = params['param_dict']['trained_url']
#    return trained_url

def download_from_BroadInst(src_filename, destination):
    # FIX - The name of this function is too narrow now. It does more than download.
    # Perhaps split the function into its pieces and rename.
    # FIX - Need to consider whether this is a rerun of a failed download or processing.
    # If the files that would be downloaded exist and are the correct size, we should
    # skip the download; also in post-processing we should see if the data has been
    # processed before, and whether the processed files are the correct size,
    # or do the functions themselves already check if the files are there and skip steps?
    # Maybe add a field for the user to indicate to ignore/delete previous data and
    # redownload and reprocess. In the Notes to the Galaxy Admin, recommend the memory
    # and computing resources that are needed to generate the indexes.
    ctat_resource_lib = 'https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/' + src_filename
    # FIX - Check that the download directory is empty if it exists.
    # Why does it need to be empty? The download extracts to a single directory inside it.
    # Also, can we check if there is enough space on the device as well?
    # FIX - Also we want to make sure that destination is an absolute, fully specified path.
    canonical_destination = os.path.realpath(destination)
    if os.path.exists(canonical_destination):
        if not os.path.isdir(canonical_destination):
            raise ValueError("The destination is not a directory: {:s}".format(canonical_destination))
        # else all is good. It is a directory.
    else:
        # We need to create it.
        os.makedirs(canonical_destination)
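
    # A possible free-space check before downloading (sketch only, responding to the FIX
    # note above; required_bytes is a hypothetical value that would need to be estimated
    # per library):
    #     stats = os.statvfs(canonical_destination)
    #     free_bytes = stats.f_frsize * stats.f_bavail
    #     if free_bytes < required_bytes:
    #         raise OSError("Not enough free space in {:s}".format(canonical_destination))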
    # Get the list of files in the directory, so after we extract the archive we can find the one
    # that was extracted as the file that is not in this list.
    orig_files_in_destdir = set(os.listdir(canonical_destination))
    # Download ref: https://dzone.com/articles/how-download-file-python
    #f = urllib2.urlopen(ctat_resource_lib)
    #data = f.read()
    #with open(filepath, 'wb') as code:
    #    code.write(data)
    # another way:
    #full_filepath = os.path.join(destination, src_filename)
    #urllib.urlretrieve(url=ctat_resource_lib, filename=full_filepath)
    # Put the following into a try statement, so that if there is a failure
    # something can be printed about it before reraising the exception.
    #tarfile.open(full_filepath, mode='r:*').extractall()
    # But we want to transfer and untar it without storing the tar file, because that
    # adds that much more to the needed amount of free space,
    # so use subprocess to pipe the output of curl into tar.
    command = "curl {:s} | tar -xzvf - -C {:s}".format(ctat_resource_lib, canonical_destination)
    try:  # to run the command that downloads and extracts the file.
        command_output = subprocess.check_output(command, shell=True)
    except subprocess.CalledProcessError as e:
        print "ERROR: Trying to run the following command:\n\t{:s}".format(command)
        print "================================================"
        print "\tOutput while running the command was:\n\n{:s}".format(e.output)
        print "================================================"
        raise
    # Get the root filename of the extracted file. It will be the file that was not in the directory
    # before we did the download and extraction.
    newfiles_in_destdir = set(os.listdir(canonical_destination)) - orig_files_in_destdir
    found_filename = None
    for filename in newfiles_in_destdir:
        # If things are right there should be just one new file, the directory that was extracted.
        # But in case something happened on the system that created other files,
        # the correct file's name should be a substring of the tar file that was downloaded.
        if filename in src_filename:
            found_filename = filename
    if found_filename is not None:
        ctat_genome_directory = canonical_destination + "/" + found_filename
        if len(os.listdir(ctat_genome_directory)) == 1:
            # Then that one file is a subdirectory that should be the ctat_genome_directory.
            subdir_filename = os.listdir(ctat_genome_directory)[0]
            ctat_genome_directory += "/" + subdir_filename
    else:
        raise ValueError("ERROR: Could not find the extracted file in the destination directory:" + \
            "\n\t{:s}".format(canonical_destination))

    # In all downloaded libraries there is additional processing
    # that needs to happen for gmap-fusion to work.
    command = "gmap_build -D {:s}/ -d ref_genome.fa.gmap -k 13 {:s}/ref_genome.fa".format( \
        ctat_genome_directory, ctat_genome_directory)
    try:  # to run the command.
        command_output = subprocess.check_output(command, shell=True)
    except subprocess.CalledProcessError as e:
        print "ERROR: While trying to process the genome library:\n\t{:s}".format(command)
        print "================================================"
        print "\n\tOutput while running the command was:\n\n{:s}".format(e.output)
        print "================================================"
        raise

    # If the src_filename indicates it is a source file, as opposed to plug-n-play,
    # then we need to do additional post processing on it with FusionFilter commands.
    if "source_data" in src_filename.split("."):
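        # For example (illustrative name only), "GRCh38_CTAT_lib.source_data.tar.gz".split(".")
        # yields ['GRCh38_CTAT_lib', 'source_data', 'tar', 'gz'], which contains "source_data";
        # a plug-n-play archive's name does not.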
        # The use of conda to install FusionFilter should make the following commands
        # available without the need to find out where FusionFilter resides.
        #     ${FusionFilter_HOME}/prep_genome_lib.pl \
        #         --genome_fa ref_genome.fa \
        #         --gtf ref_annot.gtf \
        #         --blast_pairs blast_pairs.gene_syms.outfmt6.gz \
        #         --fusion_annot_lib fusion_lib.dat.gz
        #     ${FusionFilter_HOME}/util/index_pfam_domain_info.pl \
        #         --pfam_domains PFAM.domtblout.dat.gz \
        #         --genome_lib_dir ctat_genome_lib_build_dir
        #
        # I don't know if we can run the programs without changing to the directory.
        # The instructions in https://github.com/FusionFilter/FusionFilter/wiki
        # say to change directory before running the commands.
        os.chdir(ctat_genome_directory)
        command = "prep_genome_lib.pl " + \
                  "--genome_fa ref_genome.fa " + \
                  "--gtf ref_annot.gtf " + \
                  "--blast_pairs blast_pairs.gene_syms.outfmt6.gz " + \
                  "--fusion_annot_lib fusion_lib.dat.gz"
        try:  # to run the command.
            command_output = subprocess.check_output(command, shell=True)
        except subprocess.CalledProcessError as e:
            print "ERROR: While trying to process the genome library:\n\t{:s}".format(command)
            print "================================================"
            print "\n\tOutput while running the command was:\n\n{:s}".format(e.output)
            print "================================================"
            raise
        command = "index_pfam_domain_info.pl " + \
                  "--pfam_domains PFAM.domtblout.dat.gz " + \
                  "--genome_lib_dir \"{:s}\"".format(ctat_genome_directory)
        try:  # to run the command.
            command_output = subprocess.check_output(command, shell=True)
        except subprocess.CalledProcessError as e:
            print "ERROR: While trying to process the genome library:\n\t{:s}".format(command)
            print "================================================"
            print "\n\tOutput while running the command was:\n\n{:s}".format(e.output)
            print "================================================"
            raise
        # End of post-processing for source_data files.

    return ctat_genome_directory

def main():
    # Parse the command line.
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--download', default="", \
        help='The filename of the CTAT Resource Library to download. ' + \
             'Do not use if you already have a CTAT Resource Library at the destination_path.')
    parser.add_argument('-g', '--genome_name', default="UNSPECIFIED_GenomeName", \
        help='Is used as the selector text of the entry in the data table.')
    parser.add_argument('-p', '--destination_path', \
        help='Full path of the CTAT Resource Library location or destination.')
    parser.add_argument('-o', '--output_filename', \
        help='Name of the output file, where the json dictionary will be written.')
    args = parser.parse_args()
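
    # Example invocation (illustrative values; in practice Galaxy's Data Manager framework
    # builds this command line from the tool's .xml form):
    #     python add_ctat_ref_lib.py \
    #         --download GRCh38_example_CTAT_lib.plug-n-play.tar.gz \
    #         --genome_name "GRCh38 CTAT library" \
    #         --destination_path /path/to/ctat_genome_lib \
    #         --output_filename out.json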

    # All of the input parameters are written by default to the json output file prior to
    # this program being called.
    # However, this program takes its input values from the command line rather than from the json file.
    # Just leaving the following code as a comment, in case it might be useful to someone later.
    # The target_directory is the typical location where data managers put their data, but then
    # the typical model is to copy it to the final location. With our files taking up so many
    # GB of space, we don't want to move them around, but rather have the Galaxy Admin give us
    # the final location (the destination_path) where the files will be placed (or currently reside).
    #
    #     params = from_json_string(open(output_filename).read())
    #     target_directory = params['output_data'][0]['extra_files_path']
    #     os.mkdir(target_directory)

    if args.download != "":
        ctat_genome_resource_lib_path = \
            download_from_BroadInst(src_filename=args.download, destination=args.destination_path)
    else:
        # FIX - probably should check if this is a valid path with an actual CTAT Genome Ref Lib there.
        ctat_genome_resource_lib_path = args.destination_path

    if (args.genome_name is None) or (args.genome_name == ""):
        genome_name = "GRCh38_gencode_v26"
    else:
        genome_name = args.genome_name
    # Set the table_entry_value to the basename of the directory path minus the extension.
    # FIX - Need to make sure it is unique. This is not a good way to do it; just doing it
    # this way for now, for testing.
    table_entry_value = os.path.basename(ctat_genome_resource_lib_path).split(".")[0]
    data_manager_dict = {}
    data_manager_dict['data_tables'] = {}
    data_manager_dict['data_tables']['ctat_genome_ref_libs'] = []
    data_table_entry = dict(value=table_entry_value, name=genome_name, path=ctat_genome_resource_lib_path)
    data_manager_dict['data_tables']['ctat_genome_ref_libs'].append(data_table_entry)
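
    # The dictionary serializes to json of roughly this shape (values are illustrative):
    #     {"data_tables": {"ctat_genome_ref_libs": [
    #         {"value": "GRCh38_example_CTAT_lib",
    #          "name": "GRCh38 CTAT library",
    #          "path": "/path/to/ctat_genome_lib/GRCh38_example_CTAT_lib"}]}}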

    # Save info to the json file. This is used to transfer data from the DataManager tool to the
    # data manager, which then puts it into the correct .loc file (I think).
    open(args.output_filename, 'wb').write(to_json_string(data_manager_dict))

if __name__ == "__main__":
    main()
```
