Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/signalp3.py @ 30:6d9d7cdf00fc draft
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
| author | peterjc |
|---|---|
| date | Thu, 21 Sep 2017 11:15:55 -0400 |
| parents | 3cb02adf4326 |
| children | 20da7f48b56f |
comparison
equal
deleted
inserted
replaced
| 29:3cb02adf4326 | 30:6d9d7cdf00fc |
|---|---|
| 50 itself (see the SignalP XML file for settings). | 50 itself (see the SignalP XML file for settings). |
| 51 | 51 |
| 52 Finally, you can opt to have a GFF3 file produced which will describe the | 52 Finally, you can opt to have a GFF3 file produced which will describe the |
| 53 predicted signal peptide and mature peptide for each protein (using one of | 53 predicted signal peptide and mature peptide for each protein (using one of |
| 54 the predictors which gives a cleavage site). *WORK IN PROGRESS* | 54 the predictors which gives a cleavage site). *WORK IN PROGRESS* |
| 55 """ | 55 """ # noqa: E501 |
| 56 | |
| 57 from __future__ import print_function | |
| 58 | |
| 59 import os | |
| 56 import sys | 60 import sys |
| 57 import os | |
| 58 import tempfile | 61 import tempfile |
| 59 from seq_analysis_utils import split_fasta, fasta_iterator | 62 |
| 63 from seq_analysis_utils import fasta_iterator, split_fasta | |
| 60 from seq_analysis_utils import run_jobs, thread_count | 64 from seq_analysis_utils import run_jobs, thread_count |
| 61 | 65 |
| 62 FASTA_CHUNK = 500 | 66 FASTA_CHUNK = 500 |
| 63 MAX_LEN = 6000 # Found by trial and error | 67 MAX_LEN = 6000 # Found by trial and error |
| 68 | |
| 69 if "-v" in sys.argv or "--version" in sys.argv: | |
| 70 print("SignalP Galaxy wrapper version 0.0.19") | |
| 71 sys.exit(os.system("signalp -version")) | |
| 64 | 72 |
| 65 if len(sys.argv) not in [6, 8]: | 73 if len(sys.argv) not in [6, 8]: |
| 66 sys.exit("Require five (or 7) arguments, organism, truncate, threads, " | 74 sys.exit("Require five (or 7) arguments, organism, truncate, threads, " |
| 67 "input protein FASTA file & output tabular file (plus " | 75 "input protein FASTA file & output tabular file (plus " |
| 68 "optionally cut method and GFF3 output file). " | 76 "optionally cut method and GFF3 output file). " |
| 94 | 102 |
| 95 | 103 |
| 96 tmp_dir = tempfile.mkdtemp() | 104 tmp_dir = tempfile.mkdtemp() |
| 97 | 105 |
| 98 | 106 |
| 99 def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None): | 107 def clean_tabular(raw_handle, out_handle, gff_handle=None): |
| 100 """Clean up SignalP output to make it tabular.""" | 108 """Clean up SignalP output to make it tabular.""" |
| 101 if cut_method: | |
| 102 cut_col = {"NN_Cmax": 2, | |
| 103 "NN_Ymax": 5, | |
| 104 "NN_Smax": 8, | |
| 105 "HMM_Cmax": 16}[cut_method] | |
| 106 else: | |
| 107 cut_col = None | |
| 108 for line in raw_handle: | 109 for line in raw_handle: |
| 109 if not line or line.startswith("#"): | 110 if not line or line.startswith("#"): |
| 110 continue | 111 continue |
| 111 parts = line.rstrip("\r\n").split() | 112 parts = line.rstrip("\r\n").split() |
| 112 assert len(parts) == 21, repr(line) | 113 assert len(parts) == 21, repr(line) |
| 117 parts = parts[14:15] + parts[1:14] + parts[15:] | 118 parts = parts[14:15] + parts[1:14] + parts[15:] |
| 118 out_handle.write("\t".join(parts) + "\n") | 119 out_handle.write("\t".join(parts) + "\n") |
| 119 | 120 |
| 120 | 121 |
| 121 def make_gff(fasta_file, tabular_file, gff_file, cut_method): | 122 def make_gff(fasta_file, tabular_file, gff_file, cut_method): |
| 123 """Make a GFF file.""" | |
| 122 cut_col, score_col = {"NN_Cmax": (2, 1), | 124 cut_col, score_col = {"NN_Cmax": (2, 1), |
| 123 "NN_Ymax": (5, 4), | 125 "NN_Ymax": (5, 4), |
| 124 "NN_Smax": (8, 7), | 126 "NN_Smax": (8, 7), |
| 125 "HMM_Cmax": (16, 15), | 127 "HMM_Cmax": (16, 15), |
| 126 }[cut_method] | 128 }[cut_method] |
| 150 # TODO - Why does it do this? | 152 # TODO - Why does it do this? |
| 151 cut = 1 | 153 cut = 1 |
| 152 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq)) | 154 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq)) |
| 153 score = parts[score_col] | 155 score = parts[score_col] |
| 154 gff_handle.write("##sequence-region %s %i %i\n" | 156 gff_handle.write("##sequence-region %s %i %i\n" |
| 155 % (seqid, 1, len(seq))) | 157 % (seqid, 1, len(seq))) |
| 156 # If the cut is at the very begining, there is no signal peptide! | 158 # If the cut is at the very begining, there is no signal peptide! |
| 157 if cut > 1: | 159 if cut > 1: |
| 158 # signal_peptide = SO:0000418 | 160 # signal_peptide = SO:0000418 |
| 159 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" | 161 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" |
| 160 % (seqid, source, | 162 % (seqid, source, |
| 186 try: | 188 try: |
| 187 os.rmdir(tmp_dir) | 189 os.rmdir(tmp_dir) |
| 188 except Exception: | 190 except Exception: |
| 189 pass | 191 pass |
| 190 | 192 |
| 193 | |
| 191 if len(jobs) > 1 and num_threads > 1: | 194 if len(jobs) > 1 and num_threads > 1: |
| 192 # A small "info" message for Galaxy to show the user. | 195 # A small "info" message for Galaxy to show the user. |
| 193 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | 196 print("Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))) |
| 194 results = run_jobs(jobs, num_threads) | 197 results = run_jobs(jobs, num_threads) |
| 195 assert len(fasta_files) == len(temp_files) == len(jobs) | 198 assert len(fasta_files) == len(temp_files) == len(jobs) |
| 196 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | 199 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): |
| 197 error_level = results[cmd] | 200 error_level = results[cmd] |
| 198 try: | 201 try: |
| 199 output = open(temp).readline() | 202 output = open(temp).readline() |
| 200 except IOError: | 203 except IOError: |
| 201 output = "(no output)" | 204 output = "(no output)" |
| 202 if error_level or output.lower().startswith("error running"): | 205 if error_level or output.lower().startswith("error running"): |
| 203 clean_up(fasta_files + temp_files) | 206 clean_up(fasta_files + temp_files) |
| 204 sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), | 207 if output: |
| 205 error_level) | 208 sys.stderr.write("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output)) |
| 209 else: | |
| 210 sys.stderr.write("One or more tasks failed, e.g. %i from %r with no output\n" % (error_level, cmd)) | |
| 211 sys.exit(error_level) | |
| 206 del results | 212 del results |
| 207 | 213 |
| 208 out_handle = open(tabular_file, "w") | 214 out_handle = open(tabular_file, "w") |
| 209 fields = ["ID"] | 215 fields = ["ID"] |
| 210 # NN results: | 216 # NN results: |
