Mercurial > repos > peterjc > tmhmm_and_signalp
annotate tools/protein_analysis/promoter2.py @ 34:7a2e20baacee draft default tip
"v0.2.13 - Python 3 fix for raising StopIteration"
| author | peterjc |
|---|---|
| date | Thu, 17 Jun 2021 17:58:23 +0000 |
| parents | 20da7f48b56f |
| children |
| rev | line source |
|---|---|
| 7 | 1 #!/usr/bin/env python |
| 2 """Wrapper for Promoter 2.0 for use in Galaxy. | |
| 3 | |
| 4 This script takes exactly three command line arguments: | |
| 5 * number of threads | |
| 6 * an input DNA FASTA filename | |
| 7 * output tabular filename. | |
| 8 | |
| 9 It calls the Promoter 2.0 binary (e.g. .../promoter-2.0/bin/promoter_Linux, | |
| 10 bypassing the Perl wrapper script 'promoter' which imposes a significant | |
| 11 performace overhead for no benefit here (we don't need HTML output for | |
| 12 example). | |
| 13 | |
| 14 The main feature is this Python wrapper script parsers the bespoke | |
| 15 tabular output from Promoter 2.0 and reformats it into a Galaxy friendly | |
| 16 tab separated table. | |
| 17 | |
| 18 Additionally, in order to take advantage of multiple cores the input FASTA | |
| 19 file is broken into chunks and multiple copies of promoter run at once. | |
| 20 This can be used in combination with the job-splitting available in Galaxy. | |
| 21 Note that rewriting the FASTA input file allows us to avoid a bug in | |
| 22 promoter 2 with long descriptions in the FASTA header line (over 200 | |
| 23 characters) which produces stray fragements of the description in the | |
| 24 output file, making parsing non-trivial. | |
| 25 | |
| 26 TODO - Automatically extract the sequence containing a promoter prediction? | |
| 27 """ | |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
28 |
|
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
29 from __future__ import print_function |
|
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
30 |
| 7 | 31 import commands |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
32 import os |
|
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
33 import sys |
| 7 | 34 import tempfile |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
35 |
|
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
36 from seq_analysis_utils import run_jobs, split_fasta, thread_count |
| 7 | 37 |
| 38 FASTA_CHUNK = 500 | |
| 39 | |
| 28 | 40 if "-v" in sys.argv or "--version" in sys.argv: |
| 41 sys.exit(os.system("promoter -V")) | |
| 42 | |
| 7 | 43 if len(sys.argv) != 4: |
| 32 | 44 sys.exit( |
| 45 "Require three arguments, number of threads (int), input DNA FASTA " | |
| 46 "file & output tabular file. Got %i arguments." % (len(sys.argv) - 1) | |
| 47 ) | |
| 7 | 48 |
| 29 | 49 num_threads = thread_count(sys.argv[3], default=4) |
| 7 | 50 fasta_file = os.path.abspath(sys.argv[2]) |
| 51 tabular_file = os.path.abspath(sys.argv[3]) | |
| 52 | |
| 53 tmp_dir = tempfile.mkdtemp() | |
| 54 | |
| 29 | 55 |
| 7 | 56 def get_path_and_binary(): |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
57 """Determine path and binary names for promoter tool.""" |
| 29 | 58 platform = commands.getoutput("uname") # e.g. Linux |
| 7 | 59 shell_script = commands.getoutput("which promoter") |
| 60 if not os.path.isfile(shell_script): | |
| 29 | 61 sys.exit("ERROR: Missing promoter executable shell script") |
| 7 | 62 path = None |
| 63 for line in open(shell_script): | |
| 29 | 64 if line.startswith("setenv"): # could then be tab or space! |
| 7 | 65 parts = line.rstrip().split(None, 2) |
| 66 if parts[0] == "setenv" and parts[1] == "PROM": | |
| 67 path = parts[2] | |
| 68 if not path: | |
| 29 | 69 sys.exit("ERROR: Could not find promoter path (PROM) in %r" % shell_script) |
| 7 | 70 if not os.path.isdir(path): |
| 29 | 71 sys.exit("ERROR: %r is not a directory" % path) |
| 7 | 72 bin = "%s/bin/promoter_%s" % (path, platform) |
| 73 if not os.path.isfile(bin): | |
| 29 | 74 sys.exit("ERROR: Missing promoter binary %r" % bin) |
| 7 | 75 return path, bin |
| 76 | |
| 29 | 77 |
| 7 | 78 def make_tabular(raw_handle, out_handle): |
| 79 """Parse text output into tabular, return query count.""" | |
| 80 identifier = None | |
| 81 queries = 0 | |
| 82 for line in raw_handle: | |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
83 # print(repr(line)) |
| 7 | 84 if not line.strip() or line == "Promoter prediction:\n": |
| 85 pass | |
| 86 elif line[0] != " ": | |
| 29 | 87 identifier = line.strip().replace("\t", " ").split(None, 1)[0] |
| 7 | 88 queries += 1 |
| 89 elif line == " No promoter predicted\n": | |
| 29 | 90 # End of a record |
| 7 | 91 identifier = None |
| 92 elif line == " Position Score Likelihood\n": | |
| 93 assert identifier | |
| 94 else: | |
| 95 try: | |
| 29 | 96 position, score, likelihood = line.strip().split(None, 2) |
| 7 | 97 except ValueError: |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
98 print("WARNING: Problem with line: %r" % line) |
| 7 | 99 continue |
| 29 | 100 # sys.exit("ERROR: Problem with line: %r" % line) |
| 32 | 101 if likelihood not in [ |
| 102 "ignored", | |
| 103 "Marginal prediction", | |
| 104 "Medium likely prediction", | |
| 105 "Highly likely prediction", | |
| 106 ]: | |
| 29 | 107 sys.exit("ERROR: Problem with line: %r" % line) |
| 32 | 108 out_handle.write( |
| 109 "%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood) | |
| 110 ) | |
| 7 | 111 return queries |
| 29 | 112 |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
113 |
| 7 | 114 working_dir, bin = get_path_and_binary() |
| 115 | |
| 116 if not os.path.isfile(fasta_file): | |
| 29 | 117 sys.exit("ERROR: Missing input FASTA file %r" % fasta_file) |
| 7 | 118 |
| 29 | 119 # Note that if the input FASTA file contains no sequences, |
| 120 # split_fasta returns an empty list (i.e. zero temp files). | |
| 121 # We deliberately omit the FASTA descriptions to avoid a | |
| 122 # bug in promoter2 with descriptions over 200 characters. | |
| 32 | 123 fasta_files = split_fasta( |
| 124 fasta_file, os.path.join(tmp_dir, "promoter"), FASTA_CHUNK, keep_descr=False | |
| 125 ) | |
| 29 | 126 temp_files = [f + ".out" for f in fasta_files] |
| 32 | 127 jobs = [ |
| 128 "%s %s > %s" % (bin, fasta, temp) for fasta, temp in zip(fasta_files, temp_files) | |
| 129 ] | |
| 7 | 130 |
| 29 | 131 |
| 7 | 132 def clean_up(file_list): |
| 133 for f in file_list: | |
| 134 if os.path.isfile(f): | |
| 135 os.remove(f) | |
| 136 try: | |
| 137 os.rmdir(tmp_dir) | |
| 29 | 138 except Exception: |
| 7 | 139 pass |
| 140 | |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
141 |
| 7 | 142 if len(jobs) > 1 and num_threads > 1: |
| 29 | 143 # A small "info" message for Galaxy to show the user. |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
144 print("Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))) |
| 7 | 145 cur_dir = os.path.abspath(os.curdir) |
| 146 os.chdir(working_dir) | |
| 147 results = run_jobs(jobs, num_threads) | |
| 148 os.chdir(cur_dir) | |
| 149 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | |
| 150 error_level = results[cmd] | |
| 151 if error_level: | |
| 152 try: | |
| 153 output = open(temp).readline() | |
| 154 except IOError: | |
| 155 output = "" | |
| 156 clean_up(fasta_files + temp_files) | |
| 32 | 157 sys.exit( |
| 158 "One or more tasks failed, e.g. %i from %r gave:\n%s" | |
| 159 % (error_level, cmd, output), | |
| 160 error_level, | |
| 161 ) | |
| 7 | 162 |
| 163 del results | |
| 164 del jobs | |
| 165 | |
| 166 out_handle = open(tabular_file, "w") | |
| 167 out_handle.write("#Identifier\tPosition\tScore\tLikelihood\n") | |
| 168 queries = 0 | |
| 169 for temp in temp_files: | |
| 170 data_handle = open(temp) | |
| 171 count = make_tabular(data_handle, out_handle) | |
| 172 data_handle.close() | |
| 173 if not count: | |
| 174 clean_up(fasta_files + temp_files) | |
| 29 | 175 sys.exit("No output from promoter2") |
| 7 | 176 queries += count |
| 177 out_handle.close() | |
| 178 | |
| 179 clean_up(fasta_files + temp_files) | |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
180 print("Results for %i queries" % queries) |
