comparison get_contigs_from_re_archive.py @ 0:d14182506989 draft default tip

"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
author petrn
date Tue, 15 Feb 2022 16:44:31 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d14182506989
1 #!/usr/bin/env python
2 '''
3 parse .aln file - output from cap3 program. Output is fasta file and
4 profile file
5 '''
6 import argparse
7 import re
8 import zipfile
9 import tempfile
10 import textwrap
11 import os
12
13 def parse_args():
14 '''Argument parsin'''
15 description = """
16 parsing cap3 assembly aln output
17 """
18
19 parser = argparse.ArgumentParser(
20 description=description, formatter_class=argparse.RawTextHelpFormatter)
21 parser.add_argument('-re',
22 '--re_file',
23 default=None,
24 required=True,
25 help="RepeatExlorer archive or directory",
26 type=str,
27 action='store')
28 parser.add_argument('-f',
29 '--fasta',
30 default=None,
31 required=True,
32 help="fasta output file name",
33 type=str,
34 action='store')
35 parser.add_argument('-m',
36 '--min_coverage',
37 default=5,
38 required=False,
39 help="minimum contig coverage",
40 type=int,
41 action="store")
42 parser.add_argument('-L',
43 '--min_contig_length',
44 default=50,
45 required=False,
46 help="minimum contig length",
47 type=int,
48 action="store")
49 return parser.parse_args()
50
51
52 def get_header(f):
53 aln_header = " . : . : . : . : . : . :"
54 contig_lead = "******************"
55 aln_start = -1
56 while True:
57 line = f.readline()
58 if not line:
59 return None, None
60 if line[0:18] == contig_lead:
61 line2 = f.readline()
62 else:
63 continue
64 if aln_header in line2:
65 aln_start = line2.index(aln_header)
66 break
67 contig_name = line.split()[1] + line.split()[2]
68 return contig_name, aln_start
69
70
71 def segment_start(f):
72 pos = f.tell()
73 line = f.readline()
74 # detect next contig or end of file
75 if "********" in line or line == "" or "Number of segment pairs = " in line:
76 segment = False
77 else:
78 segment = True
79 f.seek(pos)
80 return segment
81
82
83 def get_segment(f, seq_start):
84 if not segment_start(f):
85 return None, None
86 aln = []
87 while True:
88 line = f.readline()
89 if ". : . :" in line:
90 continue
91 if "__________" in line:
92 consensus = f.readline().rstrip('\n')[seq_start:]
93 f.readline() # empty line
94 break
95 else:
96 aln.append(line.rstrip('\n')[seq_start:])
97 return aln, consensus
98
99
100 def aln2coverage(aln):
101 coverage = [0] * len(aln[0])
102 for a in aln:
103 for i, c in enumerate(a):
104 if c not in " -":
105 coverage[i] += 1
106 return coverage
107
108
109 def read_contig(f, seq_start):
110 contig = ""
111 coverage = []
112 while True:
113 aln, consensus = get_segment(f, seq_start)
114 if aln:
115 contig += consensus
116 coverage += aln2coverage(aln)
117 else:
118 break
119 return contig, coverage
120
121
122 def remove_gaps(consensus, coverage):
123 if "-" not in consensus:
124 return consensus, coverage
125 new_coverage = [
126 cov for cons, cov in zip(consensus, coverage) if cons != "-"
127 ]
128 new_consensus = consensus.replace("-", "")
129 return new_consensus, new_coverage
130
131
132 def extract_contigs_from_re_archive(archive, aln_output):
133 with zipfile.ZipFile(archive, 'r') as zip_object, open(aln_output,
134 'w') as fout:
135 flist = zip_object.infolist()
136 for fn in flist:
137 if re.match('seqclust.+[.]aln$', fn.filename):
138 with zip_object.open(fn.filename) as aln:
139 for l in aln:
140 fout.write(l.decode('utf-8'))
141 return aln_output
142
143 def read_tarean_fasta(fobj):
144 ids = []
145 s = []
146 for i in fobj:
147 if isinstance(i, str):
148 ii = i
149 else:
150 ii = i.decode('utf-8')
151 if ii[0] == ">":
152 ids.append(ii)
153 s.append("")
154 else:
155 s[-1] = s[-1] + ii.strip()
156 return ids, s
157
158
159 def extract_contigs_from_re_directory(re_dir, aln_output):
160 with open(aln_output, 'w') as fout:
161 for subdir, dirs, files in os.walk(re_dir):
162 for fn in files:
163 fn_full = subdir + os.sep + fn
164 if re.match('^.+seqclust.+[.]aln$', fn_full):
165 print(fn_full)
166 with open(fn_full) as aln:
167 for l in aln:
168 fout.write(l)
169 return aln_output
170
171
172
173 def extract_tarean_contigs_from_re_archive(archive):
174 with zipfile.ZipFile(archive, 'r') as zip_object:
175 flist = zip_object.infolist()
176 seqs_all = []
177 ids_all = []
178 for fn in flist:
179 if re.match("seqclust.+dir_CL[0-9]+[/]tarean_contigs.fasta", fn.filename):
180 print(fn.filename)
181 with zip_object.open(fn.filename) as fobj:
182 ids, seqs = read_tarean_fasta(fobj)
183 # wrap sequences
184 seqs = ["\n".join(textwrap.wrap(s + s, 80)) for s in seqs]
185 seqs_all += seqs
186 ids_all += ids
187 return ids_all, seqs_all
188
189
190 def extract_tarean_contigs_from_re_directory(re_dir):
191 seqs_all = []
192 ids_all = []
193 for subdir, dirs, files in os.walk(re_dir):
194 for fn in files:
195 fn_full = subdir + os.sep + fn
196 if re.match("^.+seqclust.+dir_CL[0-9]+[/]tarean_contigs.fasta", fn_full):
197 print (fn_full)
198 with open(fn_full) as fobj:
199 ids, seqs = read_tarean_fasta(fobj)
200 # wrap sequences
201 seqs = ["\n".join(textwrap.wrap(s + s, 80)) for s in seqs]
202 seqs_all += seqs
203 ids_all += ids
204 return ids_all, seqs_all
205
206 def filter_contigs(consensus, coverage, min_coverage=5):
207 x = "".join([
208 s if cov >= min_coverage else " "
209 for s, cov in zip(consensus, coverage)
210 ]).strip()
211 consensus_N = "\n".join(textwrap.wrap(x.replace(" ", "N"),80))
212 return consensus_N
213
214
215 def main():
216 args = parse_args()
217 if os.path.isdir(args.re_file):
218 # extract from directory
219 ids, seqs = extract_tarean_contigs_from_re_directory(args.re_file)
220 aln_file = extract_contigs_from_re_directory(
221 args.re_file,
222 tempfile.NamedTemporaryFile().name)
223 else:
224 # extract aln from archive
225 ids, seqs = extract_tarean_contigs_from_re_archive(args.re_file)
226 aln_file = extract_contigs_from_re_archive(
227 args.re_file,
228 tempfile.NamedTemporaryFile().name)
229
230 with open(aln_file, 'r') as f1, open(args.fasta, 'w') as ffasta:
231 while True:
232 contig_name, seq_start = get_header(f1)
233 if contig_name:
234 consensus, coverage = remove_gaps(*read_contig(f1, seq_start))
235 clean_consensus = filter_contigs(consensus, coverage,
236 args.min_coverage)
237 if len(clean_consensus) >= args.min_contig_length:
238 ffasta.write(">{}\n".format(contig_name))
239 ffasta.write("{}\n".format(clean_consensus))
240 else:
241 break
242
243 # write tarean sequences:
244 for i, s in zip(ids, seqs):
245 ffasta.write(i)
246 ffasta.write(s + "\n")
247
248
249
250 if __name__ == "__main__":
251
252 main()