Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/promoter2.py @ 30:6d9d7cdf00fc draft
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
| author | peterjc | 
|---|---|
| date | Thu, 21 Sep 2017 11:15:55 -0400 | 
| parents | 3cb02adf4326 | 
| children | 20da7f48b56f | 
   comparison
  equal
  deleted
  inserted
  replaced
| 29:3cb02adf4326 | 30:6d9d7cdf00fc | 
|---|---|
| 16 tab separated table. | 16 tab separated table. | 
| 17 | 17 | 
| 18 Additionally, in order to take advantage of multiple cores the input FASTA | 18 Additionally, in order to take advantage of multiple cores the input FASTA | 
| 19 file is broken into chunks and multiple copies of promoter run at once. | 19 file is broken into chunks and multiple copies of promoter run at once. | 
| 20 This can be used in combination with the job-splitting available in Galaxy. | 20 This can be used in combination with the job-splitting available in Galaxy. | 
| 21 | |
| 22 Note that rewriting the FASTA input file allows us to avoid a bug in | 21 Note that rewriting the FASTA input file allows us to avoid a bug in | 
| 23 promoter 2 with long descriptions in the FASTA header line (over 200 | 22 promoter 2 with long descriptions in the FASTA header line (over 200 | 
| 24 characters) which produces stray fragments of the description in the | 23 characters) which produces stray fragments of the description in the | 
| 25 output file, making parsing non-trivial. | 24 output file, making parsing non-trivial. | 
| 26 | 25 | 
| 27 TODO - Automatically extract the sequence containing a promoter prediction? | 26 TODO - Automatically extract the sequence containing a promoter prediction? | 
| 28 """ | 27 """ | 
| 28 | |
| 29 from __future__ import print_function | |
| 30 | |
| 31 import commands | |
| 32 import os | |
| 29 import sys | 33 import sys | 
| 30 import os | |
| 31 import commands | |
| 32 import tempfile | 34 import tempfile | 
| 33 from seq_analysis_utils import split_fasta, run_jobs, thread_count | 35 | 
| 36 from seq_analysis_utils import run_jobs, split_fasta, thread_count | |
| 34 | 37 | 
| 35 FASTA_CHUNK = 500 | 38 FASTA_CHUNK = 500 | 
| 36 | 39 | 
| 37 if "-v" in sys.argv or "--version" in sys.argv: | 40 if "-v" in sys.argv or "--version" in sys.argv: | 
| 38 sys.exit(os.system("promoter -V")) | 41 sys.exit(os.system("promoter -V")) | 
| 47 | 50 | 
| 48 tmp_dir = tempfile.mkdtemp() | 51 tmp_dir = tempfile.mkdtemp() | 
| 49 | 52 | 
| 50 | 53 | 
| 51 def get_path_and_binary(): | 54 def get_path_and_binary(): | 
| 55 """Determine path and binary names for promoter tool.""" | |
| 52 platform = commands.getoutput("uname") # e.g. Linux | 56 platform = commands.getoutput("uname") # e.g. Linux | 
| 53 shell_script = commands.getoutput("which promoter") | 57 shell_script = commands.getoutput("which promoter") | 
| 54 if not os.path.isfile(shell_script): | 58 if not os.path.isfile(shell_script): | 
| 55 sys.exit("ERROR: Missing promoter executable shell script") | 59 sys.exit("ERROR: Missing promoter executable shell script") | 
| 56 path = None | 60 path = None | 
| 72 def make_tabular(raw_handle, out_handle): | 76 def make_tabular(raw_handle, out_handle): | 
| 73 """Parse text output into tabular, return query count.""" | 77 """Parse text output into tabular, return query count.""" | 
| 74 identifier = None | 78 identifier = None | 
| 75 queries = 0 | 79 queries = 0 | 
| 76 for line in raw_handle: | 80 for line in raw_handle: | 
| 77 # print repr(line) | 81 # print(repr(line)) | 
| 78 if not line.strip() or line == "Promoter prediction:\n": | 82 if not line.strip() or line == "Promoter prediction:\n": | 
| 79 pass | 83 pass | 
| 80 elif line[0] != " ": | 84 elif line[0] != " ": | 
| 81 identifier = line.strip().replace("\t", " ").split(None, 1)[0] | 85 identifier = line.strip().replace("\t", " ").split(None, 1)[0] | 
| 82 queries += 1 | 86 queries += 1 | 
| 87 assert identifier | 91 assert identifier | 
| 88 else: | 92 else: | 
| 89 try: | 93 try: | 
| 90 position, score, likelihood = line.strip().split(None, 2) | 94 position, score, likelihood = line.strip().split(None, 2) | 
| 91 except ValueError: | 95 except ValueError: | 
| 92 print "WARNING: Problem with line: %r" % line | 96 print("WARNING: Problem with line: %r" % line) | 
| 93 continue | 97 continue | 
| 94 # sys.exit("ERROR: Problem with line: %r" % line) | 98 # sys.exit("ERROR: Problem with line: %r" % line) | 
| 95 if likelihood not in ["ignored", | 99 if likelihood not in ["ignored", | 
| 96 "Marginal prediction", | 100 "Marginal prediction", | 
| 97 "Medium likely prediction", | 101 "Medium likely prediction", | 
| 98 "Highly likely prediction"]: | 102 "Highly likely prediction"]: | 
| 99 sys.exit("ERROR: Problem with line: %r" % line) | 103 sys.exit("ERROR: Problem with line: %r" % line) | 
| 100 out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood)) | 104 out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood)) | 
| 101 return queries | 105 return queries | 
| 106 | |
| 102 | 107 | 
| 103 working_dir, bin = get_path_and_binary() | 108 working_dir, bin = get_path_and_binary() | 
| 104 | 109 | 
| 105 if not os.path.isfile(fasta_file): | 110 if not os.path.isfile(fasta_file): | 
| 106 sys.exit("ERROR: Missing input FASTA file %r" % fasta_file) | 111 sys.exit("ERROR: Missing input FASTA file %r" % fasta_file) | 
| 122 try: | 127 try: | 
| 123 os.rmdir(tmp_dir) | 128 os.rmdir(tmp_dir) | 
| 124 except Exception: | 129 except Exception: | 
| 125 pass | 130 pass | 
| 126 | 131 | 
| 132 | |
| 127 if len(jobs) > 1 and num_threads > 1: | 133 if len(jobs) > 1 and num_threads > 1: | 
| 128 # A small "info" message for Galaxy to show the user. | 134 # A small "info" message for Galaxy to show the user. | 
| 129 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | 135 print("Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))) | 
| 130 cur_dir = os.path.abspath(os.curdir) | 136 cur_dir = os.path.abspath(os.curdir) | 
| 131 os.chdir(working_dir) | 137 os.chdir(working_dir) | 
| 132 results = run_jobs(jobs, num_threads) | 138 results = run_jobs(jobs, num_threads) | 
| 133 os.chdir(cur_dir) | 139 os.chdir(cur_dir) | 
| 134 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | 140 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | 
| 157 sys.exit("No output from promoter2") | 163 sys.exit("No output from promoter2") | 
| 158 queries += count | 164 queries += count | 
| 159 out_handle.close() | 165 out_handle.close() | 
| 160 | 166 | 
| 161 clean_up(fasta_files + temp_files) | 167 clean_up(fasta_files + temp_files) | 
| 162 print "Results for %i queries" % queries | 168 print("Results for %i queries" % queries) | 
