Mercurial > repos > peterjc > tmhmm_and_signalp
annotate tools/protein_analysis/promoter2.py @ 28:22e71e53f534 draft
Capture version of promoter2
author | peterjc |
---|---|
date | Tue, 01 Sep 2015 08:24:49 -0400 |
parents | 20139cb4c844 |
children | 3cb02adf4326 |
rev | line source |
---|---|
7 | 1 #!/usr/bin/env python |
2 """Wrapper for Promoter 2.0 for use in Galaxy. | |
3 | |
4 This script takes exactly three command line arguments: | |
5 * number of threads | |
6 * an input DNA FASTA filename | |
7 * output tabular filename. | |
8 | |
9 It calls the Promoter 2.0 binary (e.g. .../promoter-2.0/bin/promoter_Linux, | |
10 bypassing the Perl wrapper script 'promoter' which imposes a significant | |
11 performace overhead for no benefit here (we don't need HTML output for | |
12 example). | |
13 | |
14 The main feature is this Python wrapper script parsers the bespoke | |
15 tabular output from Promoter 2.0 and reformats it into a Galaxy friendly | |
16 tab separated table. | |
17 | |
18 Additionally, in order to take advantage of multiple cores the input FASTA | |
19 file is broken into chunks and multiple copies of promoter run at once. | |
20 This can be used in combination with the job-splitting available in Galaxy. | |
21 | |
22 Note that rewriting the FASTA input file allows us to avoid a bug in | |
23 promoter 2 with long descriptions in the FASTA header line (over 200 | |
24 characters) which produces stray fragements of the description in the | |
25 output file, making parsing non-trivial. | |
26 | |
27 TODO - Automatically extract the sequence containing a promoter prediction? | |
28 """ | |
29 import sys | |
30 import os | |
31 import commands | |
32 import tempfile | |
26
20139cb4c844
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
peterjc
parents:
7
diff
changeset
|
33 from seq_analysis_utils import sys_exit, split_fasta, run_jobs, thread_count |
7 | 34 |
35 FASTA_CHUNK = 500 | |
36 | |
28 | 37 if "-v" in sys.argv or "--version" in sys.argv: |
38 sys.exit(os.system("promoter -V")) | |
39 | |
7 | 40 if len(sys.argv) != 4: |
26
20139cb4c844
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
peterjc
parents:
7
diff
changeset
|
41 sys_exit("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. " |
7 | 42 "Got %i arguments." % (len(sys.argv)-1)) |
43 | |
44 num_threads = thread_count(sys.argv[3],default=4) | |
45 fasta_file = os.path.abspath(sys.argv[2]) | |
46 tabular_file = os.path.abspath(sys.argv[3]) | |
47 | |
48 tmp_dir = tempfile.mkdtemp() | |
49 | |
50 def get_path_and_binary(): | |
51 platform = commands.getoutput("uname") #e.g. Linux | |
52 shell_script = commands.getoutput("which promoter") | |
53 if not os.path.isfile(shell_script): | |
26
20139cb4c844
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
peterjc
parents:
7
diff
changeset
|
54 sys_exit("ERROR: Missing promoter executable shell script") |
7 | 55 path = None |
56 for line in open(shell_script): | |
57 if line.startswith("setenv"): #could then be tab or space! | |
58 parts = line.rstrip().split(None, 2) | |
59 if parts[0] == "setenv" and parts[1] == "PROM": | |
60 path = parts[2] | |
61 if not path: | |
26
20139cb4c844
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
peterjc
parents:
7
diff
changeset
|
62 sys_exit("ERROR: Could not find promoter path (PROM) in %r" % shell_script) |
7 | 63 if not os.path.isdir(path): |
26
20139cb4c844
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
peterjc
parents:
7
diff
changeset
|
64 sys_exit("ERROR: %r is not a directory" % path) |
7 | 65 bin = "%s/bin/promoter_%s" % (path, platform) |
66 if not os.path.isfile(bin): | |
26
20139cb4c844
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
peterjc
parents:
7
diff
changeset
|
67 sys_exit("ERROR: Missing promoter binary %r" % bin) |
7 | 68 return path, bin |
69 | |
70 def make_tabular(raw_handle, out_handle): | |
71 """Parse text output into tabular, return query count.""" | |
72 identifier = None | |
73 queries = 0 | |
74 for line in raw_handle: | |
75 #print repr(line) | |
76 if not line.strip() or line == "Promoter prediction:\n": | |
77 pass | |
78 elif line[0] != " ": | |
79 identifier = line.strip().replace("\t", " ").split(None,1)[0] | |
80 queries += 1 | |
81 elif line == " No promoter predicted\n": | |
82 #End of a record | |
83 identifier = None | |
84 elif line == " Position Score Likelihood\n": | |
85 assert identifier | |
86 else: | |
87 try: | |
88 position, score, likelihood = line.strip().split(None,2) | |
89 except ValueError: | |
90 print "WARNING: Problem with line: %r" % line | |
91 continue | |
26
20139cb4c844
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
peterjc
parents:
7
diff
changeset
|
92 #sys_exit("ERROR: Problem with line: %r" % line) |
7 | 93 if likelihood not in ["ignored", |
94 "Marginal prediction", | |
95 "Medium likely prediction", | |
96 "Highly likely prediction"]: | |
26
20139cb4c844
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
peterjc
parents:
7
diff
changeset
|
97 sys_exit("ERROR: Problem with line: %r" % line) |
7 | 98 out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood)) |
99 return queries | |
100 | |
101 working_dir, bin = get_path_and_binary() | |
102 | |
103 if not os.path.isfile(fasta_file): | |
26
20139cb4c844
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
peterjc
parents:
7
diff
changeset
|
104 sys_exit("ERROR: Missing input FASTA file %r" % fasta_file) |
7 | 105 |
106 #Note that if the input FASTA file contains no sequences, | |
107 #split_fasta returns an empty list (i.e. zero temp files). | |
108 #We deliberately omit the FASTA descriptions to avoid a | |
109 #bug in promoter2 with descriptions over 200 characters. | |
110 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "promoter"), FASTA_CHUNK, keep_descr=False) | |
111 temp_files = [f+".out" for f in fasta_files] | |
112 jobs = ["%s %s > %s" % (bin, fasta, temp) | |
113 for fasta, temp in zip(fasta_files, temp_files)] | |
114 | |
115 def clean_up(file_list): | |
116 for f in file_list: | |
117 if os.path.isfile(f): | |
118 os.remove(f) | |
119 try: | |
120 os.rmdir(tmp_dir) | |
121 except: | |
122 pass | |
123 | |
124 if len(jobs) > 1 and num_threads > 1: | |
125 #A small "info" message for Galaxy to show the user. | |
126 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | |
127 cur_dir = os.path.abspath(os.curdir) | |
128 os.chdir(working_dir) | |
129 results = run_jobs(jobs, num_threads) | |
130 os.chdir(cur_dir) | |
131 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | |
132 error_level = results[cmd] | |
133 if error_level: | |
134 try: | |
135 output = open(temp).readline() | |
136 except IOError: | |
137 output = "" | |
138 clean_up(fasta_files + temp_files) | |
26
20139cb4c844
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
peterjc
parents:
7
diff
changeset
|
139 sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), |
7 | 140 error_level) |
141 | |
142 del results | |
143 del jobs | |
144 | |
145 out_handle = open(tabular_file, "w") | |
146 out_handle.write("#Identifier\tPosition\tScore\tLikelihood\n") | |
147 queries = 0 | |
148 for temp in temp_files: | |
149 data_handle = open(temp) | |
150 count = make_tabular(data_handle, out_handle) | |
151 data_handle.close() | |
152 if not count: | |
153 clean_up(fasta_files + temp_files) | |
26
20139cb4c844
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
peterjc
parents:
7
diff
changeset
|
154 sys_exit("No output from promoter2") |
7 | 155 queries += count |
156 out_handle.close() | |
157 | |
158 clean_up(fasta_files + temp_files) | |
159 print "Results for %i queries" % queries |