| 7 | 1 #!/usr/bin/env python | 
|  | 2 """Wrapper for Promoter 2.0 for use in Galaxy. | 
|  | 3 | 
|  | 4 This script takes exactly three command line arguments: | 
|  | 5  * number of threads | 
|  | 6  * an input DNA FASTA filename | 
|  | 7  * output tabular filename. | 
|  | 8 | 
|  | 9 It calls the Promoter 2.0 binary (e.g. .../promoter-2.0/bin/promoter_Linux, | 
|  | 10 bypassing the Perl wrapper script 'promoter' which imposes a significant | 
|  | 11 performace overhead for no benefit here (we don't need HTML output for | 
|  | 12 example). | 
|  | 13 | 
|  | 14 The main feature is this Python wrapper script parsers the bespoke | 
|  | 15 tabular output from Promoter 2.0 and reformats it into a Galaxy friendly | 
|  | 16 tab separated table. | 
|  | 17 | 
|  | 18 Additionally, in order to take advantage of multiple cores the input FASTA | 
|  | 19 file is broken into chunks and multiple copies of promoter run at once. | 
|  | 20 This can be used in combination with the job-splitting available in Galaxy. | 
|  | 21 | 
|  | 22 Note that rewriting the FASTA input file allows us to avoid a bug in | 
|  | 23 promoter 2 with long descriptions in the FASTA header line (over 200 | 
|  | 24 characters) which produces stray fragements of the description in the | 
|  | 25 output file, making parsing non-trivial. | 
|  | 26 | 
|  | 27 TODO - Automatically extract the sequence containing a promoter prediction? | 
|  | 28 """ | 
|  | 29 import sys | 
|  | 30 import os | 
|  | 31 import commands | 
|  | 32 import tempfile | 
|  | 33 from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count | 
|  | 34 | 
|  | 35 FASTA_CHUNK = 500 | 
|  | 36 | 
|  | 37 if len(sys.argv) != 4: | 
|  | 38     stop_err("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. " | 
|  | 39              "Got %i arguments." % (len(sys.argv)-1)) | 
|  | 40 | 
|  | 41 num_threads = thread_count(sys.argv[3],default=4) | 
|  | 42 fasta_file = os.path.abspath(sys.argv[2]) | 
|  | 43 tabular_file = os.path.abspath(sys.argv[3]) | 
|  | 44 | 
|  | 45 tmp_dir = tempfile.mkdtemp() | 
|  | 46 | 
|  | 47 def get_path_and_binary(): | 
|  | 48     platform = commands.getoutput("uname") #e.g. Linux | 
|  | 49     shell_script = commands.getoutput("which promoter") | 
|  | 50     if not os.path.isfile(shell_script): | 
|  | 51         stop_err("ERROR: Missing promoter executable shell script") | 
|  | 52     path = None | 
|  | 53     for line in open(shell_script): | 
|  | 54         if line.startswith("setenv"): #could then be tab or space! | 
|  | 55             parts = line.rstrip().split(None, 2) | 
|  | 56             if parts[0] == "setenv" and parts[1] == "PROM": | 
|  | 57                 path = parts[2] | 
|  | 58     if not path: | 
|  | 59         stop_err("ERROR: Could not find promoter path (PROM) in %r" % shell_script) | 
|  | 60     if not os.path.isdir(path): | 
|  | 61         stop_error("ERROR: %r is not a directory" % path) | 
|  | 62     bin = "%s/bin/promoter_%s" % (path, platform) | 
|  | 63     if not os.path.isfile(bin): | 
|  | 64         stop_err("ERROR: Missing promoter binary %r" % bin) | 
|  | 65     return path, bin | 
|  | 66 | 
|  | 67 def make_tabular(raw_handle, out_handle): | 
|  | 68     """Parse text output into tabular, return query count.""" | 
|  | 69     identifier = None | 
|  | 70     queries = 0 | 
|  | 71     for line in raw_handle: | 
|  | 72         #print repr(line) | 
|  | 73         if not line.strip() or line == "Promoter prediction:\n": | 
|  | 74             pass | 
|  | 75         elif line[0] != " ": | 
|  | 76             identifier = line.strip().replace("\t", " ").split(None,1)[0] | 
|  | 77             queries += 1 | 
|  | 78         elif line == "  No promoter predicted\n": | 
|  | 79             #End of a record | 
|  | 80             identifier = None | 
|  | 81         elif line == "  Position  Score  Likelihood\n": | 
|  | 82             assert identifier | 
|  | 83         else: | 
|  | 84             try: | 
|  | 85                 position, score, likelihood = line.strip().split(None,2) | 
|  | 86             except ValueError: | 
|  | 87                 print "WARNING: Problem with line: %r" % line | 
|  | 88                 continue | 
|  | 89                 #stop_err("ERROR: Problem with line: %r" % line) | 
|  | 90             if likelihood not in ["ignored", | 
|  | 91                                   "Marginal prediction", | 
|  | 92                                   "Medium likely prediction", | 
|  | 93                                   "Highly likely prediction"]: | 
|  | 94                 stop_err("ERROR: Problem with line: %r" % line) | 
|  | 95             out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood)) | 
|  | 96     return queries | 
|  | 97 | 
|  | 98 working_dir, bin = get_path_and_binary() | 
|  | 99 | 
|  | 100 if not os.path.isfile(fasta_file): | 
|  | 101    stop_err("ERROR: Missing input FASTA file %r" % fasta_file) | 
|  | 102 | 
|  | 103 #Note that if the input FASTA file contains no sequences, | 
|  | 104 #split_fasta returns an empty list (i.e. zero temp files). | 
|  | 105 #We deliberately omit the FASTA descriptions to avoid a | 
|  | 106 #bug in promoter2 with descriptions over 200 characters. | 
|  | 107 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "promoter"), FASTA_CHUNK, keep_descr=False) | 
|  | 108 temp_files = [f+".out" for f in fasta_files] | 
|  | 109 jobs = ["%s %s > %s" % (bin, fasta, temp) | 
|  | 110         for fasta, temp in zip(fasta_files, temp_files)] | 
|  | 111 | 
|  | 112 def clean_up(file_list): | 
|  | 113     for f in file_list: | 
|  | 114         if os.path.isfile(f): | 
|  | 115             os.remove(f) | 
|  | 116     try: | 
|  | 117         os.rmdir(tmp_dir) | 
|  | 118     except: | 
|  | 119         pass | 
|  | 120 | 
|  | 121 if len(jobs) > 1 and num_threads > 1: | 
|  | 122     #A small "info" message for Galaxy to show the user. | 
|  | 123     print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | 
|  | 124 cur_dir = os.path.abspath(os.curdir) | 
|  | 125 os.chdir(working_dir) | 
|  | 126 results = run_jobs(jobs, num_threads) | 
|  | 127 os.chdir(cur_dir) | 
|  | 128 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | 
|  | 129     error_level = results[cmd] | 
|  | 130     if error_level: | 
|  | 131         try: | 
|  | 132             output = open(temp).readline() | 
|  | 133         except IOError: | 
|  | 134             output = "" | 
|  | 135         clean_up(fasta_files + temp_files) | 
|  | 136         stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), | 
|  | 137                  error_level) | 
|  | 138 | 
|  | 139 del results | 
|  | 140 del jobs | 
|  | 141 | 
|  | 142 out_handle = open(tabular_file, "w") | 
|  | 143 out_handle.write("#Identifier\tPosition\tScore\tLikelihood\n") | 
|  | 144 queries = 0 | 
|  | 145 for temp in temp_files: | 
|  | 146     data_handle = open(temp) | 
|  | 147     count = make_tabular(data_handle, out_handle) | 
|  | 148     data_handle.close() | 
|  | 149     if not count: | 
|  | 150         clean_up(fasta_files + temp_files) | 
|  | 151         stop_err("No output from promoter2") | 
|  | 152     queries += count | 
|  | 153 out_handle.close() | 
|  | 154 | 
|  | 155 clean_up(fasta_files + temp_files) | 
|  | 156 print "Results for %i queries" % queries |