Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/rxlr_motifs.py @ 18:2b35b5c4b7f4 draft
Uploaded v0.2.5, preview 2, fixed bug in RXLR tools
| author | peterjc | 
|---|---|
| date | Fri, 10 May 2013 07:48:26 -0400 | 
| parents | 39a6e46cdda3 | 
| children | 20139cb4c844 | 
   comparison
  equal
  deleted
  inserted
  replaced
| 17:af3174637834 | 18:2b35b5c4b7f4 | 
|---|---|
| 109 "whisson_et_al_rxlr_eer_cropped.hmm") | 109 "whisson_et_al_rxlr_eer_cropped.hmm") | 
| 110 if not os.path.isfile(hmm_file): | 110 if not os.path.isfile(hmm_file): | 
| 111 stop_err("Missing HMM file for Whisson et al. (2007)") | 111 stop_err("Missing HMM file for Whisson et al. (2007)") | 
| 112 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): | 112 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): | 
| 113 stop_err("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_searcher) | 113 stop_err("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_searcher) | 
| 114 #I've left the code to handle HMMER 3 in situ, in case | 114 | 
| 115 #we revisit the choice to insist on HMMER 2. | |
| 116 hmmer3 = (3 == get_hmmer_version(hmmer_search)) | |
| 117 #Using zero (or 5.6?) for bitscore threshold | |
| 118 if hmmer3: | |
| 119 #The HMMER3 table output is easy to parse | |
| 120 #In HMMER3 can't use both -T and -E | |
| 121 cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \ | |
| 122 % (hmmer_search, hmm_output_file, hmm_file, fasta_file) | |
| 123 else: | |
| 124 #For HMMER2 we are stuck with parsing stdout | |
| 125 #Put 1e6 to effectively have no expectation threshold (otherwise | |
| 126 #HMMER defaults to 10 and the calculated e-value depends on the | |
| 127 #input FASTA file, and we can loose hits of interest). | |
| 128 cmd = "%s -T 0 -E 1e6 %s %s > %s" \ | |
| 129 % (hmmer_search, hmm_file, fasta_file, hmm_output_file) | |
| 130 return_code = os.system(cmd) | |
| 131 if return_code: | |
| 132 stop_err("Error %i from hmmsearch:\n%s" % (return_code, cmd)) | |
| 133 hmm_hits = set() | 115 hmm_hits = set() | 
| 134 valid_ids = set() | 116 valid_ids = set() | 
| 135 for title, seq in fasta_iterator(fasta_file): | 117 for title, seq in fasta_iterator(fasta_file): | 
| 136 name = title.split(None,1)[0] | 118 name = title.split(None,1)[0] | 
| 137 if name in valid_ids: | 119 if name in valid_ids: | 
| 138 stop_err("Duplicated identifier %r" % name) | 120 stop_err("Duplicated identifier %r" % name) | 
| 139 else: | 121 else: | 
| 140 valid_ids.add(name) | 122 valid_ids.add(name) | 
| 141 handle = open(hmm_output_file) | 123 if not valid_ids: | 
| 142 for line in handle: | 124 #Special case, don't need to run HMMER if there are no sequences | 
| 143 if not line.strip(): | 125 pass | 
| 144 #We expect blank lines in the HMMER2 stdout | 126 else: | 
| 145 continue | 127 #I've left the code to handle HMMER 3 in situ, in case | 
| 146 elif line.startswith("#"): | 128 #we revisit the choice to insist on HMMER 2. | 
| 147 #Header | 129 hmmer3 = (3 == get_hmmer_version(hmmer_search)) | 
| 148 continue | 130 #Using zero (or 5.6?) for bitscore threshold | 
| 131 if hmmer3: | |
| 132 #The HMMER3 table output is easy to parse | |
| 133 #In HMMER3 can't use both -T and -E | |
| 134 cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \ | |
| 135 % (hmmer_search, hmm_output_file, hmm_file, fasta_file) | |
| 149 else: | 136 else: | 
| 150 name = line.split(None,1)[0] | 137 #For HMMER2 we are stuck with parsing stdout | 
| 151 #Should be a sequence name in the HMMER3 table output. | 138 #Put 1e6 to effectively have no expectation threshold (otherwise | 
| 152 #Could be anything in the HMMER2 stdout. | 139 #HMMER defaults to 10 and the calculated e-value depends on the | 
| 153 if name in valid_ids: | 140 #input FASTA file, and we can loose hits of interest). | 
| 154 hmm_hits.add(name) | 141 cmd = "%s -T 0 -E 1e6 %s %s > %s" \ | 
| 155 elif hmmer3: | 142 % (hmmer_search, hmm_file, fasta_file, hmm_output_file) | 
| 156 stop_err("Unexpected identifer %r in hmmsearch output" % name) | 143 return_code = os.system(cmd) | 
| 157 handle.close() | 144 if return_code: | 
| 158 #if hmmer3: | 145 stop_err("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) | 
| 159 # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) | 146 | 
| 160 #else: | 147 handle = open(hmm_output_file) | 
| 161 # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) | 148 for line in handle: | 
| 162 #print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) | 149 if not line.strip(): | 
| 163 os.remove(hmm_output_file) | 150 #We expect blank lines in the HMMER2 stdout | 
| 151 continue | |
| 152 elif line.startswith("#"): | |
| 153 #Header | |
| 154 continue | |
| 155 else: | |
| 156 name = line.split(None,1)[0] | |
| 157 #Should be a sequence name in the HMMER3 table output. | |
| 158 #Could be anything in the HMMER2 stdout. | |
| 159 if name in valid_ids: | |
| 160 hmm_hits.add(name) | |
| 161 elif hmmer3: | |
| 162 stop_err("Unexpected identifer %r in hmmsearch output" % name) | |
| 163 handle.close() | |
| 164 #if hmmer3: | |
| 165 # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) | |
| 166 #else: | |
| 167 # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) | |
| 168 #print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) | |
| 169 os.remove(hmm_output_file) | |
| 164 del valid_ids | 170 del valid_ids | 
| 165 | 171 | 
| 166 | 172 | 
| 167 #Prepare short list of candidates containing RXLR to pass to SignalP | 173 #Prepare short list of candidates containing RXLR to pass to SignalP | 
| 168 assert min_rxlr_start > 0, "Min value one, since zero based counting" | 174 assert min_rxlr_start > 0, "Min value one, since zero based counting" | 
