annotate get_contigs_from_re_archive.py @ 0:d14182506989 draft default tip

"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
author petrn
date Tue, 15 Feb 2022 16:44:31 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
1 #!/usr/bin/env python
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
2 '''
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
3 parse .aln file - output from cap3 program. Output is fasta file and
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
4 profile file
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
5 '''
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
6 import argparse
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
7 import re
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
8 import zipfile
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
9 import tempfile
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
10 import textwrap
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
11 import os
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
12
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
13 def parse_args():
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
14 '''Argument parsin'''
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
15 description = """
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
16 parsing cap3 assembly aln output
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
17 """
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
18
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
19 parser = argparse.ArgumentParser(
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
20 description=description, formatter_class=argparse.RawTextHelpFormatter)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
21 parser.add_argument('-re',
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
22 '--re_file',
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
23 default=None,
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
24 required=True,
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
25 help="RepeatExlorer archive or directory",
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
26 type=str,
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
27 action='store')
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
28 parser.add_argument('-f',
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
29 '--fasta',
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
30 default=None,
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
31 required=True,
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
32 help="fasta output file name",
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
33 type=str,
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
34 action='store')
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
35 parser.add_argument('-m',
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
36 '--min_coverage',
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
37 default=5,
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
38 required=False,
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
39 help="minimum contig coverage",
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
40 type=int,
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
41 action="store")
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
42 parser.add_argument('-L',
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
43 '--min_contig_length',
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
44 default=50,
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
45 required=False,
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
46 help="minimum contig length",
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
47 type=int,
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
48 action="store")
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
49 return parser.parse_args()
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
50
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
51
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
52 def get_header(f):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
53 aln_header = " . : . : . : . : . : . :"
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
54 contig_lead = "******************"
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
55 aln_start = -1
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
56 while True:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
57 line = f.readline()
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
58 if not line:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
59 return None, None
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
60 if line[0:18] == contig_lead:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
61 line2 = f.readline()
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
62 else:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
63 continue
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
64 if aln_header in line2:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
65 aln_start = line2.index(aln_header)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
66 break
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
67 contig_name = line.split()[1] + line.split()[2]
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
68 return contig_name, aln_start
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
69
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
70
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
71 def segment_start(f):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
72 pos = f.tell()
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
73 line = f.readline()
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
74 # detect next contig or end of file
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
75 if "********" in line or line == "" or "Number of segment pairs = " in line:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
76 segment = False
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
77 else:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
78 segment = True
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
79 f.seek(pos)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
80 return segment
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
81
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
82
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
83 def get_segment(f, seq_start):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
84 if not segment_start(f):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
85 return None, None
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
86 aln = []
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
87 while True:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
88 line = f.readline()
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
89 if ". : . :" in line:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
90 continue
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
91 if "__________" in line:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
92 consensus = f.readline().rstrip('\n')[seq_start:]
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
93 f.readline() # empty line
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
94 break
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
95 else:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
96 aln.append(line.rstrip('\n')[seq_start:])
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
97 return aln, consensus
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
98
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
99
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
100 def aln2coverage(aln):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
101 coverage = [0] * len(aln[0])
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
102 for a in aln:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
103 for i, c in enumerate(a):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
104 if c not in " -":
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
105 coverage[i] += 1
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
106 return coverage
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
107
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
108
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
109 def read_contig(f, seq_start):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
110 contig = ""
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
111 coverage = []
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
112 while True:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
113 aln, consensus = get_segment(f, seq_start)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
114 if aln:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
115 contig += consensus
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
116 coverage += aln2coverage(aln)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
117 else:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
118 break
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
119 return contig, coverage
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
120
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
121
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
122 def remove_gaps(consensus, coverage):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
123 if "-" not in consensus:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
124 return consensus, coverage
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
125 new_coverage = [
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
126 cov for cons, cov in zip(consensus, coverage) if cons != "-"
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
127 ]
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
128 new_consensus = consensus.replace("-", "")
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
129 return new_consensus, new_coverage
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
130
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
131
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
132 def extract_contigs_from_re_archive(archive, aln_output):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
133 with zipfile.ZipFile(archive, 'r') as zip_object, open(aln_output,
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
134 'w') as fout:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
135 flist = zip_object.infolist()
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
136 for fn in flist:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
137 if re.match('seqclust.+[.]aln$', fn.filename):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
138 with zip_object.open(fn.filename) as aln:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
139 for l in aln:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
140 fout.write(l.decode('utf-8'))
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
141 return aln_output
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
142
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
143 def read_tarean_fasta(fobj):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
144 ids = []
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
145 s = []
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
146 for i in fobj:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
147 if isinstance(i, str):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
148 ii = i
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
149 else:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
150 ii = i.decode('utf-8')
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
151 if ii[0] == ">":
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
152 ids.append(ii)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
153 s.append("")
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
154 else:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
155 s[-1] = s[-1] + ii.strip()
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
156 return ids, s
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
157
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
158
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
159 def extract_contigs_from_re_directory(re_dir, aln_output):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
160 with open(aln_output, 'w') as fout:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
161 for subdir, dirs, files in os.walk(re_dir):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
162 for fn in files:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
163 fn_full = subdir + os.sep + fn
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
164 if re.match('^.+seqclust.+[.]aln$', fn_full):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
165 print(fn_full)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
166 with open(fn_full) as aln:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
167 for l in aln:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
168 fout.write(l)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
169 return aln_output
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
170
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
171
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
172
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
173 def extract_tarean_contigs_from_re_archive(archive):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
174 with zipfile.ZipFile(archive, 'r') as zip_object:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
175 flist = zip_object.infolist()
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
176 seqs_all = []
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
177 ids_all = []
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
178 for fn in flist:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
179 if re.match("seqclust.+dir_CL[0-9]+[/]tarean_contigs.fasta", fn.filename):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
180 print(fn.filename)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
181 with zip_object.open(fn.filename) as fobj:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
182 ids, seqs = read_tarean_fasta(fobj)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
183 # wrap sequences
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
184 seqs = ["\n".join(textwrap.wrap(s + s, 80)) for s in seqs]
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
185 seqs_all += seqs
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
186 ids_all += ids
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
187 return ids_all, seqs_all
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
188
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
189
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
190 def extract_tarean_contigs_from_re_directory(re_dir):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
191 seqs_all = []
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
192 ids_all = []
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
193 for subdir, dirs, files in os.walk(re_dir):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
194 for fn in files:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
195 fn_full = subdir + os.sep + fn
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
196 if re.match("^.+seqclust.+dir_CL[0-9]+[/]tarean_contigs.fasta", fn_full):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
197 print (fn_full)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
198 with open(fn_full) as fobj:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
199 ids, seqs = read_tarean_fasta(fobj)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
200 # wrap sequences
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
201 seqs = ["\n".join(textwrap.wrap(s + s, 80)) for s in seqs]
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
202 seqs_all += seqs
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
203 ids_all += ids
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
204 return ids_all, seqs_all
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
205
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
206 def filter_contigs(consensus, coverage, min_coverage=5):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
207 x = "".join([
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
208 s if cov >= min_coverage else " "
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
209 for s, cov in zip(consensus, coverage)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
210 ]).strip()
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
211 consensus_N = "\n".join(textwrap.wrap(x.replace(" ", "N"),80))
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
212 return consensus_N
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
213
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
214
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
215 def main():
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
216 args = parse_args()
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
217 if os.path.isdir(args.re_file):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
218 # extract from directory
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
219 ids, seqs = extract_tarean_contigs_from_re_directory(args.re_file)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
220 aln_file = extract_contigs_from_re_directory(
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
221 args.re_file,
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
222 tempfile.NamedTemporaryFile().name)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
223 else:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
224 # extract aln from archive
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
225 ids, seqs = extract_tarean_contigs_from_re_archive(args.re_file)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
226 aln_file = extract_contigs_from_re_archive(
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
227 args.re_file,
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
228 tempfile.NamedTemporaryFile().name)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
229
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
230 with open(aln_file, 'r') as f1, open(args.fasta, 'w') as ffasta:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
231 while True:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
232 contig_name, seq_start = get_header(f1)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
233 if contig_name:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
234 consensus, coverage = remove_gaps(*read_contig(f1, seq_start))
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
235 clean_consensus = filter_contigs(consensus, coverage,
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
236 args.min_coverage)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
237 if len(clean_consensus) >= args.min_contig_length:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
238 ffasta.write(">{}\n".format(contig_name))
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
239 ffasta.write("{}\n".format(clean_consensus))
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
240 else:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
241 break
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
242
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
243 # write tarean sequences:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
244 for i, s in zip(ids, seqs):
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
245 ffasta.write(i)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
246 ffasta.write(s + "\n")
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
247
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
248
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
249
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
250 if __name__ == "__main__":
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
251
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
252 main()