Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/promoter2.py @ 7:5e62aefb2918 draft
Uploaded v0.1.2 to Test Tool Shed
| author | peterjc |
|---|---|
| date | Tue, 26 Mar 2013 14:24:56 -0400 |
| parents | |
| children | 20139cb4c844 |
comparison
equal
deleted
inserted
replaced
| 6:39a6e46cdda3 | 7:5e62aefb2918 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """Wrapper for Promoter 2.0 for use in Galaxy. | |
| 3 | |
| 4 This script takes exactly three command line arguments: | |
| 5 * number of threads | |
| 6 * an input DNA FASTA filename | |
| 7 * output tabular filename. | |
| 8 | |
| 9 It calls the Promoter 2.0 binary (e.g. .../promoter-2.0/bin/promoter_Linux, | |
| 10 bypassing the Perl wrapper script 'promoter' which imposes a significant | |
| 11 performace overhead for no benefit here (we don't need HTML output for | |
| 12 example). | |
| 13 | |
| 14 The main feature is this Python wrapper script parsers the bespoke | |
| 15 tabular output from Promoter 2.0 and reformats it into a Galaxy friendly | |
| 16 tab separated table. | |
| 17 | |
| 18 Additionally, in order to take advantage of multiple cores the input FASTA | |
| 19 file is broken into chunks and multiple copies of promoter run at once. | |
| 20 This can be used in combination with the job-splitting available in Galaxy. | |
| 21 | |
| 22 Note that rewriting the FASTA input file allows us to avoid a bug in | |
| 23 promoter 2 with long descriptions in the FASTA header line (over 200 | |
| 24 characters) which produces stray fragements of the description in the | |
| 25 output file, making parsing non-trivial. | |
| 26 | |
| 27 TODO - Automatically extract the sequence containing a promoter prediction? | |
| 28 """ | |
| 29 import sys | |
| 30 import os | |
| 31 import commands | |
| 32 import tempfile | |
| 33 from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count | |
| 34 | |
| 35 FASTA_CHUNK = 500 | |
| 36 | |
| 37 if len(sys.argv) != 4: | |
| 38 stop_err("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. " | |
| 39 "Got %i arguments." % (len(sys.argv)-1)) | |
| 40 | |
| 41 num_threads = thread_count(sys.argv[3],default=4) | |
| 42 fasta_file = os.path.abspath(sys.argv[2]) | |
| 43 tabular_file = os.path.abspath(sys.argv[3]) | |
| 44 | |
| 45 tmp_dir = tempfile.mkdtemp() | |
| 46 | |
| 47 def get_path_and_binary(): | |
| 48 platform = commands.getoutput("uname") #e.g. Linux | |
| 49 shell_script = commands.getoutput("which promoter") | |
| 50 if not os.path.isfile(shell_script): | |
| 51 stop_err("ERROR: Missing promoter executable shell script") | |
| 52 path = None | |
| 53 for line in open(shell_script): | |
| 54 if line.startswith("setenv"): #could then be tab or space! | |
| 55 parts = line.rstrip().split(None, 2) | |
| 56 if parts[0] == "setenv" and parts[1] == "PROM": | |
| 57 path = parts[2] | |
| 58 if not path: | |
| 59 stop_err("ERROR: Could not find promoter path (PROM) in %r" % shell_script) | |
| 60 if not os.path.isdir(path): | |
| 61 stop_error("ERROR: %r is not a directory" % path) | |
| 62 bin = "%s/bin/promoter_%s" % (path, platform) | |
| 63 if not os.path.isfile(bin): | |
| 64 stop_err("ERROR: Missing promoter binary %r" % bin) | |
| 65 return path, bin | |
| 66 | |
| 67 def make_tabular(raw_handle, out_handle): | |
| 68 """Parse text output into tabular, return query count.""" | |
| 69 identifier = None | |
| 70 queries = 0 | |
| 71 for line in raw_handle: | |
| 72 #print repr(line) | |
| 73 if not line.strip() or line == "Promoter prediction:\n": | |
| 74 pass | |
| 75 elif line[0] != " ": | |
| 76 identifier = line.strip().replace("\t", " ").split(None,1)[0] | |
| 77 queries += 1 | |
| 78 elif line == " No promoter predicted\n": | |
| 79 #End of a record | |
| 80 identifier = None | |
| 81 elif line == " Position Score Likelihood\n": | |
| 82 assert identifier | |
| 83 else: | |
| 84 try: | |
| 85 position, score, likelihood = line.strip().split(None,2) | |
| 86 except ValueError: | |
| 87 print "WARNING: Problem with line: %r" % line | |
| 88 continue | |
| 89 #stop_err("ERROR: Problem with line: %r" % line) | |
| 90 if likelihood not in ["ignored", | |
| 91 "Marginal prediction", | |
| 92 "Medium likely prediction", | |
| 93 "Highly likely prediction"]: | |
| 94 stop_err("ERROR: Problem with line: %r" % line) | |
| 95 out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood)) | |
| 96 return queries | |
| 97 | |
| 98 working_dir, bin = get_path_and_binary() | |
| 99 | |
| 100 if not os.path.isfile(fasta_file): | |
| 101 stop_err("ERROR: Missing input FASTA file %r" % fasta_file) | |
| 102 | |
| 103 #Note that if the input FASTA file contains no sequences, | |
| 104 #split_fasta returns an empty list (i.e. zero temp files). | |
| 105 #We deliberately omit the FASTA descriptions to avoid a | |
| 106 #bug in promoter2 with descriptions over 200 characters. | |
| 107 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "promoter"), FASTA_CHUNK, keep_descr=False) | |
| 108 temp_files = [f+".out" for f in fasta_files] | |
| 109 jobs = ["%s %s > %s" % (bin, fasta, temp) | |
| 110 for fasta, temp in zip(fasta_files, temp_files)] | |
| 111 | |
| 112 def clean_up(file_list): | |
| 113 for f in file_list: | |
| 114 if os.path.isfile(f): | |
| 115 os.remove(f) | |
| 116 try: | |
| 117 os.rmdir(tmp_dir) | |
| 118 except: | |
| 119 pass | |
| 120 | |
| 121 if len(jobs) > 1 and num_threads > 1: | |
| 122 #A small "info" message for Galaxy to show the user. | |
| 123 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | |
| 124 cur_dir = os.path.abspath(os.curdir) | |
| 125 os.chdir(working_dir) | |
| 126 results = run_jobs(jobs, num_threads) | |
| 127 os.chdir(cur_dir) | |
| 128 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | |
| 129 error_level = results[cmd] | |
| 130 if error_level: | |
| 131 try: | |
| 132 output = open(temp).readline() | |
| 133 except IOError: | |
| 134 output = "" | |
| 135 clean_up(fasta_files + temp_files) | |
| 136 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), | |
| 137 error_level) | |
| 138 | |
| 139 del results | |
| 140 del jobs | |
| 141 | |
| 142 out_handle = open(tabular_file, "w") | |
| 143 out_handle.write("#Identifier\tPosition\tScore\tLikelihood\n") | |
| 144 queries = 0 | |
| 145 for temp in temp_files: | |
| 146 data_handle = open(temp) | |
| 147 count = make_tabular(data_handle, out_handle) | |
| 148 data_handle.close() | |
| 149 if not count: | |
| 150 clean_up(fasta_files + temp_files) | |
| 151 stop_err("No output from promoter2") | |
| 152 queries += count | |
| 153 out_handle.close() | |
| 154 | |
| 155 clean_up(fasta_files + temp_files) | |
| 156 print "Results for %i queries" % queries |
