Mercurial > repos > petrn > repeat_annotation_pipeline
comparison get_contigs_from_re_archive.py @ 0:d14182506989 draft default tip
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
| author | petrn |
|---|---|
| date | Tue, 15 Feb 2022 16:44:31 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:d14182506989 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 ''' | |
| 3 parse .aln file - output from cap3 program. Output is fasta file and | |
| 4 profile file | |
| 5 ''' | |
| 6 import argparse | |
| 7 import re | |
| 8 import zipfile | |
| 9 import tempfile | |
| 10 import textwrap | |
| 11 import os | |
| 12 | |
| 13 def parse_args(): | |
| 14 '''Argument parsin''' | |
| 15 description = """ | |
| 16 parsing cap3 assembly aln output | |
| 17 """ | |
| 18 | |
| 19 parser = argparse.ArgumentParser( | |
| 20 description=description, formatter_class=argparse.RawTextHelpFormatter) | |
| 21 parser.add_argument('-re', | |
| 22 '--re_file', | |
| 23 default=None, | |
| 24 required=True, | |
| 25 help="RepeatExlorer archive or directory", | |
| 26 type=str, | |
| 27 action='store') | |
| 28 parser.add_argument('-f', | |
| 29 '--fasta', | |
| 30 default=None, | |
| 31 required=True, | |
| 32 help="fasta output file name", | |
| 33 type=str, | |
| 34 action='store') | |
| 35 parser.add_argument('-m', | |
| 36 '--min_coverage', | |
| 37 default=5, | |
| 38 required=False, | |
| 39 help="minimum contig coverage", | |
| 40 type=int, | |
| 41 action="store") | |
| 42 parser.add_argument('-L', | |
| 43 '--min_contig_length', | |
| 44 default=50, | |
| 45 required=False, | |
| 46 help="minimum contig length", | |
| 47 type=int, | |
| 48 action="store") | |
| 49 return parser.parse_args() | |
| 50 | |
| 51 | |
| 52 def get_header(f): | |
| 53 aln_header = " . : . : . : . : . : . :" | |
| 54 contig_lead = "******************" | |
| 55 aln_start = -1 | |
| 56 while True: | |
| 57 line = f.readline() | |
| 58 if not line: | |
| 59 return None, None | |
| 60 if line[0:18] == contig_lead: | |
| 61 line2 = f.readline() | |
| 62 else: | |
| 63 continue | |
| 64 if aln_header in line2: | |
| 65 aln_start = line2.index(aln_header) | |
| 66 break | |
| 67 contig_name = line.split()[1] + line.split()[2] | |
| 68 return contig_name, aln_start | |
| 69 | |
| 70 | |
| 71 def segment_start(f): | |
| 72 pos = f.tell() | |
| 73 line = f.readline() | |
| 74 # detect next contig or end of file | |
| 75 if "********" in line or line == "" or "Number of segment pairs = " in line: | |
| 76 segment = False | |
| 77 else: | |
| 78 segment = True | |
| 79 f.seek(pos) | |
| 80 return segment | |
| 81 | |
| 82 | |
| 83 def get_segment(f, seq_start): | |
| 84 if not segment_start(f): | |
| 85 return None, None | |
| 86 aln = [] | |
| 87 while True: | |
| 88 line = f.readline() | |
| 89 if ". : . :" in line: | |
| 90 continue | |
| 91 if "__________" in line: | |
| 92 consensus = f.readline().rstrip('\n')[seq_start:] | |
| 93 f.readline() # empty line | |
| 94 break | |
| 95 else: | |
| 96 aln.append(line.rstrip('\n')[seq_start:]) | |
| 97 return aln, consensus | |
| 98 | |
| 99 | |
| 100 def aln2coverage(aln): | |
| 101 coverage = [0] * len(aln[0]) | |
| 102 for a in aln: | |
| 103 for i, c in enumerate(a): | |
| 104 if c not in " -": | |
| 105 coverage[i] += 1 | |
| 106 return coverage | |
| 107 | |
| 108 | |
| 109 def read_contig(f, seq_start): | |
| 110 contig = "" | |
| 111 coverage = [] | |
| 112 while True: | |
| 113 aln, consensus = get_segment(f, seq_start) | |
| 114 if aln: | |
| 115 contig += consensus | |
| 116 coverage += aln2coverage(aln) | |
| 117 else: | |
| 118 break | |
| 119 return contig, coverage | |
| 120 | |
| 121 | |
| 122 def remove_gaps(consensus, coverage): | |
| 123 if "-" not in consensus: | |
| 124 return consensus, coverage | |
| 125 new_coverage = [ | |
| 126 cov for cons, cov in zip(consensus, coverage) if cons != "-" | |
| 127 ] | |
| 128 new_consensus = consensus.replace("-", "") | |
| 129 return new_consensus, new_coverage | |
| 130 | |
| 131 | |
| 132 def extract_contigs_from_re_archive(archive, aln_output): | |
| 133 with zipfile.ZipFile(archive, 'r') as zip_object, open(aln_output, | |
| 134 'w') as fout: | |
| 135 flist = zip_object.infolist() | |
| 136 for fn in flist: | |
| 137 if re.match('seqclust.+[.]aln$', fn.filename): | |
| 138 with zip_object.open(fn.filename) as aln: | |
| 139 for l in aln: | |
| 140 fout.write(l.decode('utf-8')) | |
| 141 return aln_output | |
| 142 | |
| 143 def read_tarean_fasta(fobj): | |
| 144 ids = [] | |
| 145 s = [] | |
| 146 for i in fobj: | |
| 147 if isinstance(i, str): | |
| 148 ii = i | |
| 149 else: | |
| 150 ii = i.decode('utf-8') | |
| 151 if ii[0] == ">": | |
| 152 ids.append(ii) | |
| 153 s.append("") | |
| 154 else: | |
| 155 s[-1] = s[-1] + ii.strip() | |
| 156 return ids, s | |
| 157 | |
| 158 | |
| 159 def extract_contigs_from_re_directory(re_dir, aln_output): | |
| 160 with open(aln_output, 'w') as fout: | |
| 161 for subdir, dirs, files in os.walk(re_dir): | |
| 162 for fn in files: | |
| 163 fn_full = subdir + os.sep + fn | |
| 164 if re.match('^.+seqclust.+[.]aln$', fn_full): | |
| 165 print(fn_full) | |
| 166 with open(fn_full) as aln: | |
| 167 for l in aln: | |
| 168 fout.write(l) | |
| 169 return aln_output | |
| 170 | |
| 171 | |
| 172 | |
| 173 def extract_tarean_contigs_from_re_archive(archive): | |
| 174 with zipfile.ZipFile(archive, 'r') as zip_object: | |
| 175 flist = zip_object.infolist() | |
| 176 seqs_all = [] | |
| 177 ids_all = [] | |
| 178 for fn in flist: | |
| 179 if re.match("seqclust.+dir_CL[0-9]+[/]tarean_contigs.fasta", fn.filename): | |
| 180 print(fn.filename) | |
| 181 with zip_object.open(fn.filename) as fobj: | |
| 182 ids, seqs = read_tarean_fasta(fobj) | |
| 183 # wrap sequences | |
| 184 seqs = ["\n".join(textwrap.wrap(s + s, 80)) for s in seqs] | |
| 185 seqs_all += seqs | |
| 186 ids_all += ids | |
| 187 return ids_all, seqs_all | |
| 188 | |
| 189 | |
| 190 def extract_tarean_contigs_from_re_directory(re_dir): | |
| 191 seqs_all = [] | |
| 192 ids_all = [] | |
| 193 for subdir, dirs, files in os.walk(re_dir): | |
| 194 for fn in files: | |
| 195 fn_full = subdir + os.sep + fn | |
| 196 if re.match("^.+seqclust.+dir_CL[0-9]+[/]tarean_contigs.fasta", fn_full): | |
| 197 print (fn_full) | |
| 198 with open(fn_full) as fobj: | |
| 199 ids, seqs = read_tarean_fasta(fobj) | |
| 200 # wrap sequences | |
| 201 seqs = ["\n".join(textwrap.wrap(s + s, 80)) for s in seqs] | |
| 202 seqs_all += seqs | |
| 203 ids_all += ids | |
| 204 return ids_all, seqs_all | |
| 205 | |
| 206 def filter_contigs(consensus, coverage, min_coverage=5): | |
| 207 x = "".join([ | |
| 208 s if cov >= min_coverage else " " | |
| 209 for s, cov in zip(consensus, coverage) | |
| 210 ]).strip() | |
| 211 consensus_N = "\n".join(textwrap.wrap(x.replace(" ", "N"),80)) | |
| 212 return consensus_N | |
| 213 | |
| 214 | |
| 215 def main(): | |
| 216 args = parse_args() | |
| 217 if os.path.isdir(args.re_file): | |
| 218 # extract from directory | |
| 219 ids, seqs = extract_tarean_contigs_from_re_directory(args.re_file) | |
| 220 aln_file = extract_contigs_from_re_directory( | |
| 221 args.re_file, | |
| 222 tempfile.NamedTemporaryFile().name) | |
| 223 else: | |
| 224 # extract aln from archive | |
| 225 ids, seqs = extract_tarean_contigs_from_re_archive(args.re_file) | |
| 226 aln_file = extract_contigs_from_re_archive( | |
| 227 args.re_file, | |
| 228 tempfile.NamedTemporaryFile().name) | |
| 229 | |
| 230 with open(aln_file, 'r') as f1, open(args.fasta, 'w') as ffasta: | |
| 231 while True: | |
| 232 contig_name, seq_start = get_header(f1) | |
| 233 if contig_name: | |
| 234 consensus, coverage = remove_gaps(*read_contig(f1, seq_start)) | |
| 235 clean_consensus = filter_contigs(consensus, coverage, | |
| 236 args.min_coverage) | |
| 237 if len(clean_consensus) >= args.min_contig_length: | |
| 238 ffasta.write(">{}\n".format(contig_name)) | |
| 239 ffasta.write("{}\n".format(clean_consensus)) | |
| 240 else: | |
| 241 break | |
| 242 | |
| 243 # write tarean sequences: | |
| 244 for i, s in zip(ids, seqs): | |
| 245 ffasta.write(i) | |
| 246 ffasta.write(s + "\n") | |
| 247 | |
| 248 | |
| 249 | |
| 250 if __name__ == "__main__": | |
| 251 | |
| 252 main() |
