Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/tmhmm2.py @ 7:5e62aefb2918 draft
Uploaded v0.1.2 to Test Tool Shed
author | peterjc |
---|---|
date | Tue, 26 Mar 2013 14:24:56 -0400 |
parents | 747cec3192d3 |
children | 391a142c1e60 |
comparison
equal
deleted
inserted
replaced
6:39a6e46cdda3 | 7:5e62aefb2918 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 """Wrapper for TMHMM v2.0 for use in Galaxy. | 2 """Wrapper for TMHMM v2.0 for use in Galaxy. |
3 | 3 |
4 This script takes exactly two command line arguments - an input protein FASTA | 4 This script takes exactly three command line arguments - number of threads, |
5 filename and an output tabular filename. It then calls the standalone TMHMM | 5 an input protein FASTA filename, and an output tabular filename. It then |
6 v2.0 program (not the webservice) requesting the short output (one line per | 6 calls the standalone TMHMM v2.0 program (not the webservice) requesting |
7 protein). | 7 the short output (one line per protein). |
8 | 8 |
9 The first major feature is cleaning up the tabular output. The short form raw | 9 The first major feature is cleaning up the tabular output. The short form raw |
10 output from TMHMM v2.0 looks like this (six columns tab separated): | 10 output from TMHMM v2.0 looks like this (six columns tab separated): |
11 | 11 |
12 gi|2781234|pdb|1JLY|B len=304 ExpAA=0.01 First60=0.00 PredHel=0 Topology=o | 12 gi|2781234|pdb|1JLY|B len=304 ExpAA=0.01 First60=0.00 PredHel=0 Topology=o |
31 (since TMHMM v2.0 itself is single threaded) by dividing the input FASTA file | 31 (since TMHMM v2.0 itself is single threaded) by dividing the input FASTA file |
32 into chunks and running multiple copies of TMHMM in parallel. I would normally | 32 into chunks and running multiple copies of TMHMM in parallel. I would normally |
33 use Python's multiprocessing library in this situation but it requires at | 33 use Python's multiprocessing library in this situation but it requires at |
34 least Python 2.6 and at the time of writing Galaxy still supports Python 2.4. | 34 least Python 2.6 and at the time of writing Galaxy still supports Python 2.4. |
35 | 35 |
36 Note that this is somewhat redundant with job-splitting available in Galaxy | |
37 itself (see the SignalP XML file for settings). | |
38 | |
36 Also tmhmm2 can fail without returning an error code, for example if run on a | 39 Also tmhmm2 can fail without returning an error code, for example if run on a |
37 64 bit machine with only the 32 bit binaries installed. This script will spot | 40 64 bit machine with only the 32 bit binaries installed. This script will spot |
38 when there is no output from tmhmm2, and raise an error. | 41 when there is no output from tmhmm2, and raise an error. |
39 """ | 42 """ |
40 import sys | 43 import sys |
41 import os | 44 import os |
42 from seq_analysis_utils import stop_err, split_fasta, run_jobs | 45 import tempfile |
46 from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count | |
43 | 47 |
44 FASTA_CHUNK = 500 | 48 FASTA_CHUNK = 500 |
45 | 49 |
46 if len(sys.argv) != 4: | 50 if len(sys.argv) != 4: |
47 stop_err("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") | 51 stop_err("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") |
48 try: | 52 |
49 num_threads = int(sys.argv[1]) | 53 num_threads = thread_count(sys.argv[1], default=4) |
50 except: | |
51 num_threads = 0 | |
52 if num_threads < 1: | |
53 stop_err("Threads argument %s is not a positive integer" % sys.argv[1]) | |
54 fasta_file = sys.argv[2] | 54 fasta_file = sys.argv[2] |
55 tabular_file = sys.argv[3] | 55 tabular_file = sys.argv[3] |
56 | |
57 tmp_dir = tempfile.mkdtemp() | |
56 | 58 |
57 def clean_tabular(raw_handle, out_handle): | 59 def clean_tabular(raw_handle, out_handle): |
58 """Clean up tabular TMHMM output, returns output line count.""" | 60 """Clean up tabular TMHMM output, returns output line count.""" |
59 count = 0 | 61 count = 0 |
60 for line in raw_handle: | 62 for line in raw_handle: |
82 count += 1 | 84 count += 1 |
83 return count | 85 return count |
84 | 86 |
85 #Note that if the input FASTA file contains no sequences, | 87 #Note that if the input FASTA file contains no sequences, |
86 #split_fasta returns an empty list (i.e. zero temp files). | 88 #split_fasta returns an empty list (i.e. zero temp files). |
87 fasta_files = split_fasta(fasta_file, tabular_file, FASTA_CHUNK) | 89 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) |
88 temp_files = [f+".out" for f in fasta_files] | 90 temp_files = [f+".out" for f in fasta_files] |
89 jobs = ["tmhmm -short %s > %s" % (fasta, temp) | 91 jobs = ["tmhmm -short %s > %s" % (fasta, temp) |
90 for fasta, temp in zip(fasta_files, temp_files)] | 92 for fasta, temp in zip(fasta_files, temp_files)] |
91 | 93 |
92 def clean_up(file_list): | 94 def clean_up(file_list): |
93 for f in file_list: | 95 for f in file_list: |
94 if os.path.isfile(f): | 96 if os.path.isfile(f): |
95 os.remove(f) | 97 os.remove(f) |
98 try: | |
99 os.rmdir(tmp_dir) | |
100 except: | |
101 pass | |
96 | 102 |
97 if len(jobs) > 1 and num_threads > 1: | 103 if len(jobs) > 1 and num_threads > 1: |
98 #A small "info" message for Galaxy to show the user. | 104 #A small "info" message for Galaxy to show the user. |
99 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | 105 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) |
100 results = run_jobs(jobs, num_threads) | 106 results = run_jobs(jobs, num_threads) |
103 if error_level: | 109 if error_level: |
104 try: | 110 try: |
105 output = open(temp).readline() | 111 output = open(temp).readline() |
106 except IOError: | 112 except IOError: |
107 output = "" | 113 output = "" |
108 clean_up(fasta_files) | 114 clean_up(fasta_files + temp_files) |
109 clean_up(temp_files) | |
110 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), | 115 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), |
111 error_level) | 116 error_level) |
112 del results | 117 del results |
113 del jobs | 118 del jobs |
114 | 119 |
117 for temp in temp_files: | 122 for temp in temp_files: |
118 data_handle = open(temp) | 123 data_handle = open(temp) |
119 count = clean_tabular(data_handle, out_handle) | 124 count = clean_tabular(data_handle, out_handle) |
120 data_handle.close() | 125 data_handle.close() |
121 if not count: | 126 if not count: |
122 clean_up(fasta_files) | 127 clean_up(fasta_files + temp_files) |
123 clean_up(temp_files) | |
124 stop_err("No output from tmhmm2") | 128 stop_err("No output from tmhmm2") |
125 out_handle.close() | 129 out_handle.close() |
126 | 130 |
127 clean_up(fasta_files) | 131 clean_up(fasta_files + temp_files) |
128 clean_up(temp_files) |