Mercurial > repos > petrn > repeat_annotation_pipeline
comparison get_contigs_from_re_archive.py @ 0:d14182506989 draft default tip
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
author | petrn |
---|---|
date | Tue, 15 Feb 2022 16:44:31 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d14182506989 |
---|---|
1 #!/usr/bin/env python | |
2 ''' | |
3 parse .aln file - output from cap3 program. Output is fasta file and | |
4 profile file | |
5 ''' | |
6 import argparse | |
7 import re | |
8 import zipfile | |
9 import tempfile | |
10 import textwrap | |
11 import os | |
12 | |
13 def parse_args(): | |
14 '''Argument parsin''' | |
15 description = """ | |
16 parsing cap3 assembly aln output | |
17 """ | |
18 | |
19 parser = argparse.ArgumentParser( | |
20 description=description, formatter_class=argparse.RawTextHelpFormatter) | |
21 parser.add_argument('-re', | |
22 '--re_file', | |
23 default=None, | |
24 required=True, | |
25 help="RepeatExlorer archive or directory", | |
26 type=str, | |
27 action='store') | |
28 parser.add_argument('-f', | |
29 '--fasta', | |
30 default=None, | |
31 required=True, | |
32 help="fasta output file name", | |
33 type=str, | |
34 action='store') | |
35 parser.add_argument('-m', | |
36 '--min_coverage', | |
37 default=5, | |
38 required=False, | |
39 help="minimum contig coverage", | |
40 type=int, | |
41 action="store") | |
42 parser.add_argument('-L', | |
43 '--min_contig_length', | |
44 default=50, | |
45 required=False, | |
46 help="minimum contig length", | |
47 type=int, | |
48 action="store") | |
49 return parser.parse_args() | |
50 | |
51 | |
52 def get_header(f): | |
53 aln_header = " . : . : . : . : . : . :" | |
54 contig_lead = "******************" | |
55 aln_start = -1 | |
56 while True: | |
57 line = f.readline() | |
58 if not line: | |
59 return None, None | |
60 if line[0:18] == contig_lead: | |
61 line2 = f.readline() | |
62 else: | |
63 continue | |
64 if aln_header in line2: | |
65 aln_start = line2.index(aln_header) | |
66 break | |
67 contig_name = line.split()[1] + line.split()[2] | |
68 return contig_name, aln_start | |
69 | |
70 | |
71 def segment_start(f): | |
72 pos = f.tell() | |
73 line = f.readline() | |
74 # detect next contig or end of file | |
75 if "********" in line or line == "" or "Number of segment pairs = " in line: | |
76 segment = False | |
77 else: | |
78 segment = True | |
79 f.seek(pos) | |
80 return segment | |
81 | |
82 | |
83 def get_segment(f, seq_start): | |
84 if not segment_start(f): | |
85 return None, None | |
86 aln = [] | |
87 while True: | |
88 line = f.readline() | |
89 if ". : . :" in line: | |
90 continue | |
91 if "__________" in line: | |
92 consensus = f.readline().rstrip('\n')[seq_start:] | |
93 f.readline() # empty line | |
94 break | |
95 else: | |
96 aln.append(line.rstrip('\n')[seq_start:]) | |
97 return aln, consensus | |
98 | |
99 | |
100 def aln2coverage(aln): | |
101 coverage = [0] * len(aln[0]) | |
102 for a in aln: | |
103 for i, c in enumerate(a): | |
104 if c not in " -": | |
105 coverage[i] += 1 | |
106 return coverage | |
107 | |
108 | |
109 def read_contig(f, seq_start): | |
110 contig = "" | |
111 coverage = [] | |
112 while True: | |
113 aln, consensus = get_segment(f, seq_start) | |
114 if aln: | |
115 contig += consensus | |
116 coverage += aln2coverage(aln) | |
117 else: | |
118 break | |
119 return contig, coverage | |
120 | |
121 | |
122 def remove_gaps(consensus, coverage): | |
123 if "-" not in consensus: | |
124 return consensus, coverage | |
125 new_coverage = [ | |
126 cov for cons, cov in zip(consensus, coverage) if cons != "-" | |
127 ] | |
128 new_consensus = consensus.replace("-", "") | |
129 return new_consensus, new_coverage | |
130 | |
131 | |
132 def extract_contigs_from_re_archive(archive, aln_output): | |
133 with zipfile.ZipFile(archive, 'r') as zip_object, open(aln_output, | |
134 'w') as fout: | |
135 flist = zip_object.infolist() | |
136 for fn in flist: | |
137 if re.match('seqclust.+[.]aln$', fn.filename): | |
138 with zip_object.open(fn.filename) as aln: | |
139 for l in aln: | |
140 fout.write(l.decode('utf-8')) | |
141 return aln_output | |
142 | |
143 def read_tarean_fasta(fobj): | |
144 ids = [] | |
145 s = [] | |
146 for i in fobj: | |
147 if isinstance(i, str): | |
148 ii = i | |
149 else: | |
150 ii = i.decode('utf-8') | |
151 if ii[0] == ">": | |
152 ids.append(ii) | |
153 s.append("") | |
154 else: | |
155 s[-1] = s[-1] + ii.strip() | |
156 return ids, s | |
157 | |
158 | |
159 def extract_contigs_from_re_directory(re_dir, aln_output): | |
160 with open(aln_output, 'w') as fout: | |
161 for subdir, dirs, files in os.walk(re_dir): | |
162 for fn in files: | |
163 fn_full = subdir + os.sep + fn | |
164 if re.match('^.+seqclust.+[.]aln$', fn_full): | |
165 print(fn_full) | |
166 with open(fn_full) as aln: | |
167 for l in aln: | |
168 fout.write(l) | |
169 return aln_output | |
170 | |
171 | |
172 | |
173 def extract_tarean_contigs_from_re_archive(archive): | |
174 with zipfile.ZipFile(archive, 'r') as zip_object: | |
175 flist = zip_object.infolist() | |
176 seqs_all = [] | |
177 ids_all = [] | |
178 for fn in flist: | |
179 if re.match("seqclust.+dir_CL[0-9]+[/]tarean_contigs.fasta", fn.filename): | |
180 print(fn.filename) | |
181 with zip_object.open(fn.filename) as fobj: | |
182 ids, seqs = read_tarean_fasta(fobj) | |
183 # wrap sequences | |
184 seqs = ["\n".join(textwrap.wrap(s + s, 80)) for s in seqs] | |
185 seqs_all += seqs | |
186 ids_all += ids | |
187 return ids_all, seqs_all | |
188 | |
189 | |
190 def extract_tarean_contigs_from_re_directory(re_dir): | |
191 seqs_all = [] | |
192 ids_all = [] | |
193 for subdir, dirs, files in os.walk(re_dir): | |
194 for fn in files: | |
195 fn_full = subdir + os.sep + fn | |
196 if re.match("^.+seqclust.+dir_CL[0-9]+[/]tarean_contigs.fasta", fn_full): | |
197 print (fn_full) | |
198 with open(fn_full) as fobj: | |
199 ids, seqs = read_tarean_fasta(fobj) | |
200 # wrap sequences | |
201 seqs = ["\n".join(textwrap.wrap(s + s, 80)) for s in seqs] | |
202 seqs_all += seqs | |
203 ids_all += ids | |
204 return ids_all, seqs_all | |
205 | |
206 def filter_contigs(consensus, coverage, min_coverage=5): | |
207 x = "".join([ | |
208 s if cov >= min_coverage else " " | |
209 for s, cov in zip(consensus, coverage) | |
210 ]).strip() | |
211 consensus_N = "\n".join(textwrap.wrap(x.replace(" ", "N"),80)) | |
212 return consensus_N | |
213 | |
214 | |
215 def main(): | |
216 args = parse_args() | |
217 if os.path.isdir(args.re_file): | |
218 # extract from directory | |
219 ids, seqs = extract_tarean_contigs_from_re_directory(args.re_file) | |
220 aln_file = extract_contigs_from_re_directory( | |
221 args.re_file, | |
222 tempfile.NamedTemporaryFile().name) | |
223 else: | |
224 # extract aln from archive | |
225 ids, seqs = extract_tarean_contigs_from_re_archive(args.re_file) | |
226 aln_file = extract_contigs_from_re_archive( | |
227 args.re_file, | |
228 tempfile.NamedTemporaryFile().name) | |
229 | |
230 with open(aln_file, 'r') as f1, open(args.fasta, 'w') as ffasta: | |
231 while True: | |
232 contig_name, seq_start = get_header(f1) | |
233 if contig_name: | |
234 consensus, coverage = remove_gaps(*read_contig(f1, seq_start)) | |
235 clean_consensus = filter_contigs(consensus, coverage, | |
236 args.min_coverage) | |
237 if len(clean_consensus) >= args.min_contig_length: | |
238 ffasta.write(">{}\n".format(contig_name)) | |
239 ffasta.write("{}\n".format(clean_consensus)) | |
240 else: | |
241 break | |
242 | |
243 # write tarean sequences: | |
244 for i, s in zip(ids, seqs): | |
245 ffasta.write(i) | |
246 ffasta.write(s + "\n") | |
247 | |
248 | |
249 | |
250 if __name__ == "__main__": | |
251 | |
252 main() |