Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/rxlr_motifs.py @ 6:39a6e46cdda3
Migrated tool version 0.0.9 from old tool shed archive to new tool shed repository
| author | peterjc |
|---|---|
| date | Tue, 07 Jun 2011 17:41:38 -0400 |
| parents | |
| children | 2b35b5c4b7f4 |
comparison
equal
deleted
inserted
replaced
| 5:ef7ceca37e3f | 6:39a6e46cdda3 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """Implements assorted RXLR motif methods from the literature | |
| 3 | |
| 4 This script takes exactly four command line arguments: | |
| 5 * Protein FASTA filename | |
| 6 * Number of threads | |
| 7 * Model name (Bhattacharjee2006, Win2007, Whisson2007) | |
| 8 * Output tabular filename | |
| 9 | |
| 10 The model names are: | |
| 11 | |
| 12 Bhattacharjee2006: Simple regular expression search for RXLR | |
| 13 with additional requirements for positioning and signal peptide. | |
| 14 | |
| 15 Win2007: Simple regular expression search for RXLR, but with | |
| 16 different positional requirements. | |
| 17 | |
| 18 Whisson2007: As Bhattacharjee2006 but with a more complex regular | |
| 19 expression to look for RXLR-EER domain, and additionally calls HMMER. | |
| 20 | |
| 21 See the help text in the accompanying Galaxy tool XML file for more | |
| 22 details including the full references. | |
| 23 | |
| 24 Note: | |
| 25 | |
| 26 Bhattacharjee et al. (2006) and Win et al. (2007) used SignalP v2.0, | |
| 27 which is no longer available. The current release is SignalP v3.0 | |
| 28 (Mar 5, 2007). We have therefore opted to use the NN Ymax position for | |
| 29 the predicted cleavage site, as this is expected to be more accurate. | |
| 30 Also note that the HMM score values have changed from v2.0 to v3.0. | |
| 31 Whisson et al. (2007) used SignalP v3.0 anyway. | |
| 32 | |
| 33 Whisson et al. (2007) used HMMER 2.3.2, and althought their HMM model | |
| 34 can still be used with hmmsearch from HMMER 3 this this does give | |
| 35 slightly different results. We expect the hmmsearch from HMMER 2.3.2 | |
| 36 (the last stable release of HMMER 2) to be present on the path under | |
| 37 the name hmmsearch2 (allowing it to co-exist with HMMER 3). | |
| 38 """ | |
| 39 import os | |
| 40 import sys | |
| 41 import re | |
| 42 import subprocess | |
| 43 from seq_analysis_utils import stop_err, fasta_iterator | |
| 44 | |
| 45 if len(sys.argv) != 5: | |
| 46 stop_err("Requires four arguments: protein FASTA filename, threads, model, and output filename") | |
| 47 | |
| 48 fasta_file, threads, model, tabular_file = sys.argv[1:] | |
| 49 hmm_output_file = tabular_file + ".hmm.tmp" | |
| 50 signalp_input_file = tabular_file + ".fasta.tmp" | |
| 51 signalp_output_file = tabular_file + ".tabular.tmp" | |
| 52 min_signalp_hmm = 0.9 | |
| 53 hmmer_search = "hmmsearch2" | |
| 54 | |
| 55 if model == "Bhattacharjee2006": | |
| 56 signalp_trunc = 70 | |
| 57 re_rxlr = re.compile("R.LR") | |
| 58 min_sp = 10 | |
| 59 max_sp = 40 | |
| 60 max_sp_rxlr = 100 | |
| 61 min_rxlr_start = 1 | |
| 62 #Allow signal peptide to be at most 40aa, and want RXLR to be | |
| 63 #within 100aa, therefore for the prescreen the max start is 140: | |
| 64 max_rxlr_start = max_sp + max_sp_rxlr | |
| 65 elif model == "Win2007": | |
| 66 signalp_trunc = 70 | |
| 67 re_rxlr = re.compile("R.LR") | |
| 68 min_sp = 10 | |
| 69 max_sp = 40 | |
| 70 min_rxlr_start = 30 | |
| 71 max_rxlr_start = 60 | |
| 72 #No explicit limit on separation of signal peptide clevage | |
| 73 #and RXLR, but shortest signal peptide is 10, and furthest | |
| 74 #away RXLR is 60, so effectively limit is 50. | |
| 75 max_sp_rxlr = max_rxlr_start - min_sp + 1 | |
| 76 elif model == "Whisson2007": | |
| 77 signalp_trunc = 0 #zero for no truncation | |
| 78 re_rxlr = re.compile("R.LR.{,40}[ED][ED][KR]") | |
| 79 min_sp = 10 | |
| 80 max_sp = 40 | |
| 81 max_sp_rxlr = 100 | |
| 82 min_rxlr_start = 1 | |
| 83 max_rxlr_start = max_sp + max_sp_rxlr | |
| 84 else: | |
| 85 stop_err("Did not recognise the model name %r\n" | |
| 86 "Use Bhattacharjee2006, Win2007, or Whisson2007" % model) | |
| 87 | |
| 88 | |
| 89 def get_hmmer_version(exe, required=None): | |
| 90 cmd = "%s -h" % exe | |
| 91 try: | |
| 92 child = subprocess.Popen([exe, "-h"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) | |
| 93 except OSError: | |
| 94 raise ValueError("Could not run %s" % exe) | |
| 95 stdout, stderr = child.communicate() | |
| 96 if required: | |
| 97 return required in stdout | |
| 98 elif "HMMER 2" in stdout: | |
| 99 return 2 | |
| 100 elif "HMMER 3" in stdout: | |
| 101 return 3 | |
| 102 else: | |
| 103 raise ValueError("Could not determine version of %s" % exe) | |
| 104 | |
| 105 | |
| 106 #Run hmmsearch for Whisson et al. (2007) | |
| 107 if model == "Whisson2007": | |
| 108 hmm_file = os.path.join(os.path.split(sys.argv[0])[0], | |
| 109 "whisson_et_al_rxlr_eer_cropped.hmm") | |
| 110 if not os.path.isfile(hmm_file): | |
| 111 stop_err("Missing HMM file for Whisson et al. (2007)") | |
| 112 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): | |
| 113 stop_err("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_searcher) | |
| 114 #I've left the code to handle HMMER 3 in situ, in case | |
| 115 #we revisit the choice to insist on HMMER 2. | |
| 116 hmmer3 = (3 == get_hmmer_version(hmmer_search)) | |
| 117 #Using zero (or 5.6?) for bitscore threshold | |
| 118 if hmmer3: | |
| 119 #The HMMER3 table output is easy to parse | |
| 120 #In HMMER3 can't use both -T and -E | |
| 121 cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \ | |
| 122 % (hmmer_search, hmm_output_file, hmm_file, fasta_file) | |
| 123 else: | |
| 124 #For HMMER2 we are stuck with parsing stdout | |
| 125 #Put 1e6 to effectively have no expectation threshold (otherwise | |
| 126 #HMMER defaults to 10 and the calculated e-value depends on the | |
| 127 #input FASTA file, and we can loose hits of interest). | |
| 128 cmd = "%s -T 0 -E 1e6 %s %s > %s" \ | |
| 129 % (hmmer_search, hmm_file, fasta_file, hmm_output_file) | |
| 130 return_code = os.system(cmd) | |
| 131 if return_code: | |
| 132 stop_err("Error %i from hmmsearch:\n%s" % (return_code, cmd)) | |
| 133 hmm_hits = set() | |
| 134 valid_ids = set() | |
| 135 for title, seq in fasta_iterator(fasta_file): | |
| 136 name = title.split(None,1)[0] | |
| 137 if name in valid_ids: | |
| 138 stop_err("Duplicated identifier %r" % name) | |
| 139 else: | |
| 140 valid_ids.add(name) | |
| 141 handle = open(hmm_output_file) | |
| 142 for line in handle: | |
| 143 if not line.strip(): | |
| 144 #We expect blank lines in the HMMER2 stdout | |
| 145 continue | |
| 146 elif line.startswith("#"): | |
| 147 #Header | |
| 148 continue | |
| 149 else: | |
| 150 name = line.split(None,1)[0] | |
| 151 #Should be a sequence name in the HMMER3 table output. | |
| 152 #Could be anything in the HMMER2 stdout. | |
| 153 if name in valid_ids: | |
| 154 hmm_hits.add(name) | |
| 155 elif hmmer3: | |
| 156 stop_err("Unexpected identifer %r in hmmsearch output" % name) | |
| 157 handle.close() | |
| 158 #if hmmer3: | |
| 159 # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) | |
| 160 #else: | |
| 161 # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) | |
| 162 #print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) | |
| 163 os.remove(hmm_output_file) | |
| 164 del valid_ids | |
| 165 | |
| 166 | |
| 167 #Prepare short list of candidates containing RXLR to pass to SignalP | |
| 168 assert min_rxlr_start > 0, "Min value one, since zero based counting" | |
| 169 count = 0 | |
| 170 total = 0 | |
| 171 handle = open(signalp_input_file, "w") | |
| 172 for title, seq in fasta_iterator(fasta_file): | |
| 173 total += 1 | |
| 174 name = title.split(None,1)[0] | |
| 175 match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) | |
| 176 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: | |
| 177 #This is a potential RXLR, depending on the SignalP results. | |
| 178 #Might as well truncate the sequence now, makes the temp file smaller | |
| 179 if signalp_trunc: | |
| 180 handle.write(">%s (truncated)\n%s\n" % (name, seq[:signalp_trunc])) | |
| 181 else: | |
| 182 #Does it matter we don't line wrap? | |
| 183 handle.write(">%s\n%s\n" % (name, seq)) | |
| 184 count += 1 | |
| 185 handle.close() | |
| 186 #print "Running SignalP on %i/%i potentials." % (count, total) | |
| 187 | |
| 188 | |
| 189 #Run SignalP (using our wrapper script to get multi-core support etc) | |
| 190 signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py") | |
| 191 if not os.path.isfile(signalp_script): | |
| 192 stop_err("Error - missing signalp3.py script") | |
| 193 cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file) | |
| 194 return_code = os.system(cmd) | |
| 195 if return_code: | |
| 196 stop_err("Error %i from SignalP:\n%s" % (return_code, cmd)) | |
| 197 #print "SignalP done" | |
| 198 | |
| 199 def parse_signalp(filename): | |
| 200 """Parse SignalP output, yield tuples of ID, HMM_Sprob_score and NN predicted signal peptide length. | |
| 201 | |
| 202 For signal peptide length we use NN_Ymax_pos (minus one). | |
| 203 """ | |
| 204 handle = open(filename) | |
| 205 line = handle.readline() | |
| 206 assert line.startswith("#ID\t"), line | |
| 207 for line in handle: | |
| 208 parts = line.rstrip("\t").split("\t") | |
| 209 assert len(parts)==20, repr(line) | |
| 210 yield parts[0], float(parts[18]), int(parts[5])-1 | |
| 211 handle.close() | |
| 212 | |
| 213 | |
| 214 #Parse SignalP results and apply the strict RXLR criteria | |
| 215 total = 0 | |
| 216 tally = dict() | |
| 217 handle = open(tabular_file, "w") | |
| 218 handle.write("#ID\t%s\n" % model) | |
| 219 signalp_results = parse_signalp(signalp_output_file) | |
| 220 for title, seq in fasta_iterator(fasta_file): | |
| 221 total += 1 | |
| 222 rxlr = "N" | |
| 223 name = title.split(None,1)[0] | |
| 224 match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) | |
| 225 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: | |
| 226 del match | |
| 227 #This was the criteria for calling SignalP, | |
| 228 #so it will be in the SignalP results. | |
| 229 sp_id, sp_hmm_score, sp_nn_len = signalp_results.next() | |
| 230 assert name == sp_id, "%s vs %s" % (name, sp_id) | |
| 231 if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp: | |
| 232 match = re_rxlr.search(seq[sp_nn_len:].upper()) | |
| 233 if match and match.start() + 1 <= max_sp_rxlr: #1-based counting | |
| 234 rxlr_start = sp_nn_len + match.start() + 1 | |
| 235 if min_rxlr_start <= rxlr_start <= max_rxlr_start: | |
| 236 rxlr = "Y" | |
| 237 if model == "Whisson2007": | |
| 238 #Combine the signalp with regular expression heuristic and the HMM | |
| 239 if name in hmm_hits and rxlr == "N": | |
| 240 rxlr = "hmm" #HMM only | |
| 241 elif rxlr == "N": | |
| 242 rxlr = "neither" #Don't use N (no) | |
| 243 elif name not in hmm_hits and rxlr == "Y": | |
| 244 rxlr = "re" #Heuristic only | |
| 245 #Now have a four way classifier: Y, hmm, re, neither | |
| 246 #and count is the number of Y results (both HMM and heuristic) | |
| 247 handle.write("%s\t%s\n" % (name, rxlr)) | |
| 248 try: | |
| 249 tally[rxlr] += 1 | |
| 250 except KeyError: | |
| 251 tally[rxlr] = 1 | |
| 252 handle.close() | |
| 253 assert sum(tally.values()) == total | |
| 254 | |
| 255 #Check the iterator is finished | |
| 256 try: | |
| 257 signalp_results.next() | |
| 258 assert False, "Unexpected data in SignalP output" | |
| 259 except StopIteration: | |
| 260 pass | |
| 261 | |
| 262 #Cleanup | |
| 263 os.remove(signalp_input_file) | |
| 264 os.remove(signalp_output_file) | |
| 265 | |
| 266 #Short summary to stdout for Galaxy's info display | |
| 267 print "%s for %i sequences:" % (model, total) | |
| 268 print ", ".join("%s = %i" % kv for kv in sorted(tally.iteritems())) |
