Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/rxlr_motifs.py @ 29:3cb02adf4326 draft
v0.2.9 Python style improvements
| author | peterjc | 
|---|---|
| date | Wed, 01 Feb 2017 09:46:14 -0500 | 
| parents | 20139cb4c844 | 
| children | 6d9d7cdf00fc | 
   comparison
  equal
  deleted
  inserted
  replaced
| 28:22e71e53f534 | 29:3cb02adf4326 | 
|---|---|
| 38 """ | 38 """ | 
| 39 import os | 39 import os | 
| 40 import sys | 40 import sys | 
| 41 import re | 41 import re | 
| 42 import subprocess | 42 import subprocess | 
| 43 from seq_analysis_utils import sys_exit, fasta_iterator | 43 from seq_analysis_utils import fasta_iterator | 
| 44 | 44 | 
| 45 if "-v" in sys.argv: | 45 if "-v" in sys.argv: | 
| 46 print("RXLR Motifs v0.0.10") | 46 print("RXLR Motifs v0.0.10") | 
| 47 sys.exit(0) | 47 sys.exit(0) | 
| 48 | 48 | 
| 49 if len(sys.argv) != 5: | 49 if len(sys.argv) != 5: | 
| 50 sys_exit("Requires four arguments: protein FASTA filename, threads, model, and output filename") | 50 sys.exit("Requires four arguments: protein FASTA filename, threads, model, and output filename") | 
| 51 | 51 | 
| 52 fasta_file, threads, model, tabular_file = sys.argv[1:] | 52 fasta_file, threads, model, tabular_file = sys.argv[1:] | 
| 53 hmm_output_file = tabular_file + ".hmm.tmp" | 53 hmm_output_file = tabular_file + ".hmm.tmp" | 
| 54 signalp_input_file = tabular_file + ".fasta.tmp" | 54 signalp_input_file = tabular_file + ".fasta.tmp" | 
| 55 signalp_output_file = tabular_file + ".tabular.tmp" | 55 signalp_output_file = tabular_file + ".tabular.tmp" | 
| 84 max_sp = 40 | 84 max_sp = 40 | 
| 85 max_sp_rxlr = 100 | 85 max_sp_rxlr = 100 | 
| 86 min_rxlr_start = 1 | 86 min_rxlr_start = 1 | 
| 87 max_rxlr_start = max_sp + max_sp_rxlr | 87 max_rxlr_start = max_sp + max_sp_rxlr | 
| 88 else: | 88 else: | 
| 89 sys_exit("Did not recognise the model name %r\n" | 89 sys.exit("Did not recognise the model name %r\n" | 
| 90 "Use Bhattacharjee2006, Win2007, or Whisson2007" % model) | 90 "Use Bhattacharjee2006, Win2007, or Whisson2007" % model) | 
| 91 | 91 | 
| 92 | 92 | 
| 93 def get_hmmer_version(exe, required=None): | 93 def get_hmmer_version(exe, required=None): | 
| 94 cmd = "%s -h" % exe | 94 cmd = "%s -h" % exe | 
| 95 try: | 95 try: | 
| 103 return 2 | 103 return 2 | 
| 104 elif "HMMER 3" in stdout: | 104 elif "HMMER 3" in stdout: | 
| 105 return 3 | 105 return 3 | 
| 106 else: | 106 else: | 
| 107 raise ValueError("Could not determine version of %s" % exe) | 107 raise ValueError("Could not determine version of %s" % exe) | 
| 108 | 108 | 
| 109 | 109 | 
| 110 #Run hmmsearch for Whisson et al. (2007) | 110 # Run hmmsearch for Whisson et al. (2007) | 
| 111 if model == "Whisson2007": | 111 if model == "Whisson2007": | 
| 112 hmm_file = os.path.join(os.path.split(sys.argv[0])[0], | 112 hmm_file = os.path.join(os.path.split(sys.argv[0])[0], | 
| 113 "whisson_et_al_rxlr_eer_cropped.hmm") | 113 "whisson_et_al_rxlr_eer_cropped.hmm") | 
| 114 if not os.path.isfile(hmm_file): | 114 if not os.path.isfile(hmm_file): | 
| 115 sys_exit("Missing HMM file for Whisson et al. (2007)") | 115 sys.exit("Missing HMM file for Whisson et al. (2007)") | 
| 116 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): | 116 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): | 
| 117 sys_exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search) | 117 sys.exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search) | 
| 118 | 118 | 
| 119 hmm_hits = set() | 119 hmm_hits = set() | 
| 120 valid_ids = set() | 120 valid_ids = set() | 
| 121 for title, seq in fasta_iterator(fasta_file): | 121 for title, seq in fasta_iterator(fasta_file): | 
| 122 name = title.split(None,1)[0] | 122 name = title.split(None, 1)[0] | 
| 123 if name in valid_ids: | 123 if name in valid_ids: | 
| 124 sys_exit("Duplicated identifier %r" % name) | 124 sys.exit("Duplicated identifier %r" % name) | 
| 125 else: | 125 else: | 
| 126 valid_ids.add(name) | 126 valid_ids.add(name) | 
| 127 if not valid_ids: | 127 if not valid_ids: | 
| 128 # Special case, don't need to run HMMER if there are no sequences | 128 # Special case, don't need to run HMMER if there are no sequences | 
| 129 pass | 129 pass | 
| 144 # input FASTA file, and we can loose hits of interest). | 144 # input FASTA file, and we can loose hits of interest). | 
| 145 cmd = "%s -T 0 -E 1e6 %s %s > %s" \ | 145 cmd = "%s -T 0 -E 1e6 %s %s > %s" \ | 
| 146 % (hmmer_search, hmm_file, fasta_file, hmm_output_file) | 146 % (hmmer_search, hmm_file, fasta_file, hmm_output_file) | 
| 147 return_code = os.system(cmd) | 147 return_code = os.system(cmd) | 
| 148 if return_code: | 148 if return_code: | 
| 149 sys_exit("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) | 149 sys.exit("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) | 
| 150 | 150 | 
| 151 handle = open(hmm_output_file) | 151 handle = open(hmm_output_file) | 
| 152 for line in handle: | 152 for line in handle: | 
| 153 if not line.strip(): | 153 if not line.strip(): | 
| 154 # We expect blank lines in the HMMER2 stdout | 154 # We expect blank lines in the HMMER2 stdout | 
| 155 continue | 155 continue | 
| 156 elif line.startswith("#"): | 156 elif line.startswith("#"): | 
| 157 # Header | 157 # Header | 
| 158 continue | 158 continue | 
| 159 else: | 159 else: | 
| 160 name = line.split(None,1)[0] | 160 name = line.split(None, 1)[0] | 
| 161 #Should be a sequence name in the HMMER3 table output. | 161 # Should be a sequence name in the HMMER3 table output. | 
| 162 #Could be anything in the HMMER2 stdout. | 162 # Could be anything in the HMMER2 stdout. | 
| 163 if name in valid_ids: | 163 if name in valid_ids: | 
| 164 hmm_hits.add(name) | 164 hmm_hits.add(name) | 
| 165 elif hmmer3: | 165 elif hmmer3: | 
| 166 sys_exit("Unexpected identifer %r in hmmsearch output" % name) | 166 sys.exit("Unexpected identifer %r in hmmsearch output" % name) | 
| 167 handle.close() | 167 handle.close() | 
| 168 # if hmmer3: | 168 # if hmmer3: | 
| 169 # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) | 169 # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) | 
| 170 # else: | 170 # else: | 
| 171 # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) | 171 # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) | 
| 172 # print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) | 172 # print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) | 
| 173 os.remove(hmm_output_file) | 173 os.remove(hmm_output_file) | 
| 174 del valid_ids | 174 del valid_ids | 
| 175 | 175 | 
| 176 | 176 | 
| 179 count = 0 | 179 count = 0 | 
| 180 total = 0 | 180 total = 0 | 
| 181 handle = open(signalp_input_file, "w") | 181 handle = open(signalp_input_file, "w") | 
| 182 for title, seq in fasta_iterator(fasta_file): | 182 for title, seq in fasta_iterator(fasta_file): | 
| 183 total += 1 | 183 total += 1 | 
| 184 name = title.split(None,1)[0] | 184 name = title.split(None, 1)[0] | 
| 185 match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) | 185 match = re_rxlr.search(seq[min_rxlr_start - 1:].upper()) | 
| 186 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: | 186 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: | 
| 187 # This is a potential RXLR, depending on the SignalP results. | 187 # This is a potential RXLR, depending on the SignalP results. | 
| 188 # Might as well truncate the sequence now, makes the temp file smaller | 188 # Might as well truncate the sequence now, makes the temp file smaller | 
| 189 if signalp_trunc: | 189 if signalp_trunc: | 
| 190 handle.write(">%s (truncated)\n%s\n" % (name, seq[:signalp_trunc])) | 190 handle.write(">%s (truncated)\n%s\n" % (name, seq[:signalp_trunc])) | 
| 197 | 197 | 
| 198 | 198 | 
| 199 # Run SignalP (using our wrapper script to get multi-core support etc) | 199 # Run SignalP (using our wrapper script to get multi-core support etc) | 
| 200 signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py") | 200 signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py") | 
| 201 if not os.path.isfile(signalp_script): | 201 if not os.path.isfile(signalp_script): | 
| 202 sys_exit("Error - missing signalp3.py script") | 202 sys.exit("Error - missing signalp3.py script") | 
| 203 cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file) | 203 cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file) | 
| 204 return_code = os.system(cmd) | 204 return_code = os.system(cmd) | 
| 205 if return_code: | 205 if return_code: | 
| 206 sys_exit("Error %i from SignalP:\n%s" % (return_code, cmd)) | 206 sys.exit("Error %i from SignalP:\n%s" % (return_code, cmd)) | 
| 207 # print "SignalP done" | 207 # print "SignalP done" | 
| 208 | 208 | 
| 209 | 209 | 
| 210 def parse_signalp(filename): | 210 def parse_signalp(filename): | 
| 211 """Parse SignalP output, yield tuples of ID, HMM_Sprob_score and NN predicted signal peptide length. | 211 """Parse SignalP output, yield tuples of ID, HMM_Sprob_score and NN predicted signal peptide length. | 
| 215 handle = open(filename) | 215 handle = open(filename) | 
| 216 line = handle.readline() | 216 line = handle.readline() | 
| 217 assert line.startswith("#ID\t"), line | 217 assert line.startswith("#ID\t"), line | 
| 218 for line in handle: | 218 for line in handle: | 
| 219 parts = line.rstrip("\t").split("\t") | 219 parts = line.rstrip("\t").split("\t") | 
| 220 assert len(parts)==20, repr(line) | 220 assert len(parts) == 20, repr(line) | 
| 221 yield parts[0], float(parts[18]), int(parts[5])-1 | 221 yield parts[0], float(parts[18]), int(parts[5]) - 1 | 
| 222 handle.close() | 222 handle.close() | 
| 223 | 223 | 
| 224 | 224 | 
| 225 # Parse SignalP results and apply the strict RXLR criteria | 225 # Parse SignalP results and apply the strict RXLR criteria | 
| 226 total = 0 | 226 total = 0 | 
| 229 handle.write("#ID\t%s\n" % model) | 229 handle.write("#ID\t%s\n" % model) | 
| 230 signalp_results = parse_signalp(signalp_output_file) | 230 signalp_results = parse_signalp(signalp_output_file) | 
| 231 for title, seq in fasta_iterator(fasta_file): | 231 for title, seq in fasta_iterator(fasta_file): | 
| 232 total += 1 | 232 total += 1 | 
| 233 rxlr = "N" | 233 rxlr = "N" | 
| 234 name = title.split(None,1)[0] | 234 name = title.split(None, 1)[0] | 
| 235 match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) | 235 match = re_rxlr.search(seq[min_rxlr_start - 1:].upper()) | 
| 236 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: | 236 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: | 
| 237 del match | 237 del match | 
| 238 # This was the criteria for calling SignalP, | 238 # This was the criteria for calling SignalP, | 
| 239 #so it will be in the SignalP results. | 239 # so it will be in the SignalP results. | 
| 240 sp_id, sp_hmm_score, sp_nn_len = signalp_results.next() | 240 sp_id, sp_hmm_score, sp_nn_len = signalp_results.next() | 
| 241 assert name == sp_id, "%s vs %s" % (name, sp_id) | 241 assert name == sp_id, "%s vs %s" % (name, sp_id) | 
| 242 if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp: | 242 if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp: | 
| 243 match = re_rxlr.search(seq[sp_nn_len:].upper()) | 243 match = re_rxlr.search(seq[sp_nn_len:].upper()) | 
| 244 if match and match.start() + 1 <= max_sp_rxlr: # 1-based counting | 244 if match and match.start() + 1 <= max_sp_rxlr: # 1-based counting | 
