Mercurial > repos > peterjc > tmhmm_and_signalp
annotate tools/protein_analysis/psortb.py @ 28:22e71e53f534 draft
Capture version of promoter2
author | peterjc |
---|---|
date | Tue, 01 Sep 2015 08:24:49 -0400 |
parents | 20139cb4c844 |
children | 3cb02adf4326 |
rev | line source |
---|---|
8 | 1 #!/usr/bin/env python |
2 """Wrapper for psortb for use in Galaxy. | |
3 | |
4 This script takes exactly six command line arguments - which includes the | |
5 number of threads, and the input protein FASTA filename and output | |
6 tabular filename. It then splits up the FASTA input and calls multiple | |
7 copies of the standalone psortb v3 program, then collates the output. | |
8 e.g. Rather than this, | |
9 | |
10 psort $type -c $cutoff -d $divergent -o long $sequence > $outfile | |
11 | |
12 Call this: | |
13 | |
14 psort $threads $type $cutoff $divergent $sequence $outfile | |
15 | |
16 If ommitting -c or -d options, set $cutoff and $divergent to zero or blank. | |
17 | |
18 Note that this is somewhat redundant with job-splitting available in Galaxy | |
19 itself (see the SignalP XML file for settings), but both can be applied. | |
20 | |
21 Additionally it ensures the header line (with the column names) starts | |
22 with a # character as used elsewhere in Galaxy. | |
23 """ | |
24 import sys | |
25 import os | |
26 import tempfile | |
26
20139cb4c844
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
peterjc
parents:
11
diff
changeset
|
27 from seq_analysis_utils import sys_exit, split_fasta, run_jobs, thread_count |
8 | 28 |
29 FASTA_CHUNK = 500 | |
30 | |
31 if "-v" in sys.argv or "--version" in sys.argv: | |
32 """Return underlying PSORTb's version""" | |
33 sys.exit(os.system("psort --version")) | |
34 | |
35 if len(sys.argv) != 8: | |
26
20139cb4c844
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
peterjc
parents:
11
diff
changeset
|
36 sys_exit("Require 7 arguments, number of threads (int), type (e.g. archaea), " |
8 | 37 "output (e.g. terse/normal/long), cutoff, divergent, input protein " |
38 "FASTA file & output tabular file") | |
39 | |
40 num_threads = thread_count(sys.argv[1], default=4) | |
41 org_type = sys.argv[2] | |
42 out_type = sys.argv[3] | |
43 cutoff = sys.argv[4] | |
44 if cutoff.strip() and float(cutoff.strip()) != 0.0: | |
45 cutoff = "-c %s" % cutoff | |
46 else: | |
47 cutoff = "" | |
48 divergent = sys.argv[5] | |
49 if divergent.strip() and float(divergent.strip()) != 0.0: | |
50 divergent = "-d %s" % divergent | |
51 else: | |
52 divergent = "" | |
53 fasta_file = sys.argv[6] | |
54 tabular_file = sys.argv[7] | |
55 | |
56 if out_type == "terse": | |
57 header = ['SeqID', 'Localization', 'Score'] | |
58 elif out_type == "normal": | |
26
20139cb4c844
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
peterjc
parents:
11
diff
changeset
|
59 sys_exit("Normal output not implemented yet, sorry.") |
8 | 60 elif out_type == "long": |
61 if org_type == "-n": | |
62 #Gram negative bacteria | |
63 header = ['SeqID', 'CMSVM-_Localization', 'CMSVM-_Details', 'CytoSVM-_Localization', 'CytoSVM-_Details', | |
64 'ECSVM-_Localization', 'ECSVM-_Details', 'ModHMM-_Localization', 'ModHMM-_Details', | |
65 'Motif-_Localization', 'Motif-_Details', 'OMPMotif-_Localization', 'OMPMotif-_Details', | |
66 'OMSVM-_Localization', 'OMSVM-_Details', 'PPSVM-_Localization', 'PPSVM-_Details', | |
67 'Profile-_Localization', 'Profile-_Details', | |
68 'SCL-BLAST-_Localization', 'SCL-BLAST-_Details', 'SCL-BLASTe-_Localization', 'SCL-BLASTe-_Details', | |
69 'Signal-_Localization', 'Signal-_Details', | |
70 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Periplasmic_Score', 'OuterMembrane_Score', | |
71 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', | |
72 'Secondary_Localization', 'PSortb_Version'] | |
73 elif org_type == "-p": | |
74 #Gram positive bacteria | |
75 header = ['SeqID', 'CMSVM+_Localization', 'CMSVM+_Details', 'CWSVM+_Localization', 'CWSVM+_Details', | |
76 'CytoSVM+_Localization', 'CytoSVM+_Details', 'ECSVM+_Localization', 'ECSVM+_Details', | |
77 'ModHMM+_Localization', 'ModHMM+_Details', 'Motif+_Localization', 'Motif+_Details', | |
78 'Profile+_Localization', 'Profile+_Details', | |
79 'SCL-BLAST+_Localization', 'SCL-BLAST+_Details', 'SCL-BLASTe+_Localization', 'SCL-BLASTe+_Details', | |
80 'Signal+_Localization', 'Signal+_Details', | |
81 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score', | |
82 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', | |
83 'Secondary_Localization', 'PSortb_Version'] | |
84 elif org_type == "-a": | |
85 #Archaea | |
86 header = ['SeqID', 'CMSVM_a_Localization', 'CMSVM_a_Details', 'CWSVM_a_Localization', 'CWSVM_a_Details', | |
87 'CytoSVM_a_Localization', 'CytoSVM_a_Details', 'ECSVM_a_Localization', 'ECSVM_a_Details', | |
88 'ModHMM_a_Localization', 'ModHMM_a_Details', 'Motif_a_Localization', 'Motif_a_Details', | |
89 'Profile_a_Localization', 'Profile_a_Details', | |
90 'SCL-BLAST_a_Localization', 'SCL-BLAST_a_Details', 'SCL-BLASTe_a_Localization', 'SCL-BLASTe_a_Details', | |
91 'Signal_a_Localization', 'Signal_a_Details', | |
92 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score', | |
93 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', | |
94 'Secondary_Localization', 'PSortb_Version'] | |
95 else: | |
26
20139cb4c844
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
peterjc
parents:
11
diff
changeset
|
96 sys_exit("Expected -n, -p or -a for the organism type, not %r" % org_type) |
8 | 97 else: |
26
20139cb4c844
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
peterjc
parents:
11
diff
changeset
|
98 sys_exit("Expected terse, normal or long for the output type, not %r" % out_type) |
8 | 99 |
100 tmp_dir = tempfile.mkdtemp() | |
101 | |
102 def clean_tabular(raw_handle, out_handle): | |
103 """Clean up tabular TMHMM output, returns output line count.""" | |
104 global header | |
105 count = 0 | |
106 for line in raw_handle: | |
107 if not line.strip() or line.startswith("#"): | |
108 #Ignore any blank lines or comment lines | |
109 continue | |
110 parts = [x.strip() for x in line.rstrip("\r\n").split("\t")] | |
111 if parts == header: | |
112 #Ignore the header line | |
113 continue | |
114 if not parts[-1] and len(parts) == len(header) + 1: | |
115 #Ignore dummy blank extra column, e.g. | |
116 #"...2.0\t\tPSORTb version 3.0\t\n" | |
117 parts = parts[:-1] | |
118 assert len(parts) == len(header), \ | |
119 "%i fields, not %i, in line:\n%r" % (len(line), len(header), line) | |
120 out_handle.write(line) | |
121 count += 1 | |
122 return count | |
123 | |
124 #Note that if the input FASTA file contains no sequences, | |
125 #split_fasta returns an empty list (i.e. zero temp files). | |
126 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) | |
127 temp_files = [f+".out" for f in fasta_files] | |
128 jobs = ["psort %s %s %s -o %s %s > %s" % (org_type, cutoff, divergent, out_type, fasta, temp) | |
129 for fasta, temp in zip(fasta_files, temp_files)] | |
130 | |
131 def clean_up(file_list): | |
132 for f in file_list: | |
133 if os.path.isfile(f): | |
134 os.remove(f) | |
135 try: | |
136 os.rmdir(tmp_dir) | |
137 except: | |
138 pass | |
139 | |
140 if len(jobs) > 1 and num_threads > 1: | |
141 #A small "info" message for Galaxy to show the user. | |
142 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | |
143 results = run_jobs(jobs, num_threads) | |
144 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | |
145 error_level = results[cmd] | |
146 if error_level: | |
147 try: | |
148 output = open(temp).readline() | |
149 except IOError: | |
150 output = "" | |
151 clean_up(fasta_files + temp_files) | |
26
20139cb4c844
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
peterjc
parents:
11
diff
changeset
|
152 sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), |
8 | 153 error_level) |
154 del results | |
155 del jobs | |
156 | |
157 out_handle = open(tabular_file, "w") | |
158 out_handle.write("#%s\n" % "\t".join(header)) | |
11 | 159 count = 0 |
8 | 160 for temp in temp_files: |
161 data_handle = open(temp) | |
11 | 162 count += clean_tabular(data_handle, out_handle) |
8 | 163 data_handle.close() |
164 if not count: | |
165 clean_up(fasta_files + temp_files) | |
26
20139cb4c844
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
peterjc
parents:
11
diff
changeset
|
166 sys_exit("No output from psortb") |
8 | 167 out_handle.close() |
168 print "%i records" % count | |
169 | |
170 clean_up(fasta_files + temp_files) |