Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/signalp3.py @ 0:a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
| author | peterjc |
|---|---|
| date | Tue, 07 Jun 2011 17:37:26 -0400 |
| parents | |
| children | ef7ceca37e3f |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:a2eeeaa6f75e |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """Wrapper for SignalP v3.0 for use in Galaxy. | |
| 3 | |
| 4 This script takes exactly fives command line arguments: | |
| 5 * the organism type (euk, gram+ or gram-) | |
| 6 * length to truncate sequences to (integer) | |
| 7 * number of threads to use (integer) | |
| 8 * an input protein FASTA filename | |
| 9 * output tabular filename. | |
| 10 | |
| 11 It then calls the standalone SignalP v3.0 program (not the webservice) | |
| 12 requesting the short output (one line per protein) using both NN and HMM | |
| 13 for predictions. | |
| 14 | |
| 15 First major feature is cleaning up the output. The raw output from SignalP | |
| 16 v3.0 looks like this (21 columns space separated): | |
| 17 | |
| 18 # SignalP-NN euk predictions # SignalP-HMM euk predictions | |
| 19 # name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? # name ! Cmax pos ? Sprob ? | |
| 20 gi|2781234|pdb|1JLY| 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N gi|2781234|pdb|1JLY|B Q 0.000 17 N 0.000 N | |
| 21 gi|4959044|gb|AAD342 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N gi|4959044|gb|AAD34209.1|AF069992_1 Q 0.000 0 N 0.000 N | |
| 22 gi|671626|emb|CAA856 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N gi|671626|emb|CAA85685.1| Q 0.000 0 N 0.000 N | |
| 23 gi|3298468|dbj|BAA31 0.208 24 N 0.184 38 N 0.980 32 Y 0.613 Y 0.398 N gi|3298468|dbj|BAA31520.1| Q 0.066 24 N 0.139 N | |
| 24 | |
| 25 In order to make it easier to use in Galaxy, this wrapper script reformats | |
| 26 this to use tab separators. Also it removes the redundant truncated name | |
| 27 column, and assigns unique column names in the header: | |
| 28 | |
| 29 #ID NN_Cmax_score NN_Cmax_pos NN_Cmax_pred NN_Ymax_score NN_Ymax_pos NN_Ymax_pred NN_Smax_score NN_Smax_pos NN_Smax_pred NN_Smean_score NN_Smean_pred NN_D_score NN_D_pred HMM_bang HMM_Cmax_score HMM_Cmax_pos HMM_Cmax_pred HMM_Sprob_score HMM_Sprob_pred | |
| 30 gi|2781234|pdb|1JLY|B 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N Q 0.000 17 N 0.000 N | |
| 31 gi|4959044|gb|AAD34209.1|AF069992_1 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N Q 0.000 0 N 0.000 N | |
| 32 gi|671626|emb|CAA85685.1| 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N Q 0.000 0 N 0.000 N | |
| 33 gi|3298468|dbj|BAA31520.1| 0.208 24 N 0.184 38 N 0.980 32 Y 0.613 Y 0.398 N Q 0.066 24 N 0.139 N | |
| 34 | |
| 35 The second major feature is overcoming SignalP's built in limit of 4000 | |
| 36 sequences by breaking up the input FASTA file into chunks. This also allows | |
| 37 us to pre-trim the sequences since SignalP only needs their starts. | |
| 38 | |
| 39 The third major feature is taking advantage of multiple cores (since SignalP | |
| 40 v3.0 itself is single threaded) by using the individual FASTA input files to | |
| 41 run multiple copies of TMHMM in parallel. I would normally use Python's | |
| 42 multiprocessing library in this situation but it requires at least Python 2.6 | |
| 43 and at the time of writing Galaxy still supports Python 2.4. | |
| 44 """ | |
| 45 import sys | |
| 46 import os | |
| 47 from seq_analysis_utils import stop_err, split_fasta, run_jobs | |
| 48 | |
| 49 FASTA_CHUNK = 500 | |
| 50 MAX_LEN = 6000 #Found by trial and error | |
| 51 | |
| 52 if len(sys.argv) != 6: | |
| 53 stop_err("Require five arguments, organism, truncate, threads, input protein FASTA file & output tabular file") | |
| 54 | |
| 55 organism = sys.argv[1] | |
| 56 if organism not in ["euk", "gram+", "gram-"]: | |
| 57 stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism) | |
| 58 | |
| 59 try: | |
| 60 truncate = int(sys.argv[2]) | |
| 61 except: | |
| 62 truncate = 0 | |
| 63 if truncate < 0: | |
| 64 stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) | |
| 65 | |
| 66 try: | |
| 67 num_threads = int(sys.argv[3]) | |
| 68 except: | |
| 69 num_threads = 0 | |
| 70 if num_threads < 1: | |
| 71 stop_err("Threads argument %s is not a positive integer" % sys.argv[3]) | |
| 72 | |
| 73 fasta_file = sys.argv[4] | |
| 74 | |
| 75 tabular_file = sys.argv[5] | |
| 76 | |
| 77 def clean_tabular(raw_handle, out_handle): | |
| 78 """Clean up SignalP output to make it tabular.""" | |
| 79 for line in raw_handle: | |
| 80 if not line or line.startswith("#"): | |
| 81 continue | |
| 82 parts = line.rstrip("\r\n").split() | |
| 83 assert len(parts)==21, repr(line) | |
| 84 assert parts[14].startswith(parts[0]) | |
| 85 #Remove redundant truncated name column (col 0) | |
| 86 #and put full name at start (col 14) | |
| 87 parts = parts[14:15] + parts[1:14] + parts[15:] | |
| 88 out_handle.write("\t".join(parts) + "\n") | |
| 89 | |
| 90 fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) | |
| 91 temp_files = [f+".out" for f in fasta_files] | |
| 92 assert len(fasta_files) == len(temp_files) | |
| 93 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp) | |
| 94 for (fasta, temp) in zip(fasta_files, temp_files)] | |
| 95 assert len(fasta_files) == len(temp_files) == len(jobs) | |
| 96 | |
| 97 def clean_up(file_list): | |
| 98 for f in file_list: | |
| 99 if os.path.isfile(f): | |
| 100 os.remove(f) | |
| 101 | |
| 102 if len(jobs) > 1 and num_threads > 1: | |
| 103 #A small "info" message for Galaxy to show the user. | |
| 104 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | |
| 105 results = run_jobs(jobs, num_threads) | |
| 106 assert len(fasta_files) == len(temp_files) == len(jobs) | |
| 107 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | |
| 108 error_level = results[cmd] | |
| 109 try: | |
| 110 output = open(temp).readline() | |
| 111 except IOError: | |
| 112 output = "" | |
| 113 if error_level or output.lower().startswith("error running"): | |
| 114 clean_up(fasta_files) | |
| 115 clean_up(temp_files) | |
| 116 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), | |
| 117 error_level) | |
| 118 del results | |
| 119 | |
| 120 out_handle = open(tabular_file, "w") | |
| 121 fields = ["ID"] | |
| 122 #NN results: | |
| 123 for name in ["Cmax", "Ymax", "Smax"]: | |
| 124 fields.extend(["NN_%s_score"%name, "NN_%s_pos"%name, "NN_%s_pred"%name]) | |
| 125 fields.extend(["NN_Smean_score", "NN_Smean_pred", "NN_D_score", "NN_D_pred"]) | |
| 126 #HMM results: | |
| 127 fields.extend(["HMM_type", "HMM_Cmax_score", "HMM_Cmax_pos", "HMM_Cmax_pred", | |
| 128 "HMM_Sprob_score", "HMM_Sprob_pred"]) | |
| 129 out_handle.write("#" + "\t".join(fields) + "\n") | |
| 130 for temp in temp_files: | |
| 131 data_handle = open(temp) | |
| 132 clean_tabular(data_handle, out_handle) | |
| 133 data_handle.close() | |
| 134 out_handle.close() | |
| 135 | |
| 136 clean_up(fasta_files) | |
| 137 clean_up(temp_files) |
