Mercurial > repos > dfornika > artic_align_trim
comparison align_trim.py @ 0:defebd1f95b9 draft
"planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/artic_align_trim commit 374121821497c96c8450afda266951c2f431ba11-dirty"
author | dfornika |
---|---|
date | Tue, 10 Mar 2020 22:03:49 +0000 |
parents | |
children | 26516cf26444 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:defebd1f95b9 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # Written by Nick Loman | |
4 # Originally part of the ZiBRA pipeline (zibraproject.org) | |
5 # This file adapted from ARTICnetwork 'fieldbioinformatics' pipeline: | |
6 # https://github.com/artic-network/fieldbioinformatics | |
7 | |
8 import sys | |
9 | |
10 from collections import defaultdict | |
11 from copy import copy | |
12 | |
13 import pysam | |
14 | |
15 def read_bed_file(fn): | |
16 bedfile = [] | |
17 with open(fn) as csvfile: | |
18 reader = csv.reader(csvfile, dialect='excel-tab') | |
19 for row in reader: | |
20 bedrow = {} | |
21 bedrow['Primer_ID'] = row[3] | |
22 | |
23 if len(row) >= 6: | |
24 # new style bed | |
25 bedrow['direction'] = row[5] | |
26 elif len(row) == 5: | |
27 # old style without directory | |
28 if 'LEFT' in row[3]: | |
29 bedrow['direction'] = '+' | |
30 elif 'RIGHT' in row[3]: | |
31 bedrow['direction'] = '-' | |
32 else: | |
33 print("Malformed BED file!", file=sys.stderr) | |
34 raise SystemExit | |
35 | |
36 if bedrow['direction'] == '+': | |
37 bedrow['end'] = int(row[2]) | |
38 bedrow['start'] = int(row[1]) | |
39 else: | |
40 bedrow['end'] = int(row[1]) | |
41 bedrow['start'] = int(row[2]) | |
42 bedfile.append(bedrow) | |
43 return bedfile | |
44 | |
45 | |
46 def check_still_matching_bases(s): | |
47 for flag, length in s.cigartuples: | |
48 if flag == 0: | |
49 return True | |
50 return False | |
51 | |
52 def trim(args, cigar, s, start_pos, end): | |
53 if not end: | |
54 pos = s.pos | |
55 else: | |
56 pos = s.reference_end | |
57 | |
58 eaten = 0 | |
59 while 1: | |
60 ## chomp stuff off until we reach pos | |
61 if end: | |
62 flag, length = cigar.pop() | |
63 else: | |
64 flag, length = cigar.pop(0) | |
65 | |
66 if args.verbose: | |
67 print("Chomped a %s, %s" % (flag, length), file=sys.stderr) | |
68 | |
69 if flag == 0: | |
70 ## match | |
71 #to_trim -= length | |
72 eaten += length | |
73 if not end: | |
74 pos += length | |
75 else: | |
76 pos -= length | |
77 if flag == 1: | |
78 ## insertion to the ref | |
79 #to_trim -= length | |
80 eaten += length | |
81 if flag == 2: | |
82 ## deletion to the ref | |
83 #eaten += length | |
84 if not end: | |
85 pos += length | |
86 else: | |
87 pos -= length | |
88 pass | |
89 if flag == 4: | |
90 eaten += length | |
91 if not end and pos >= start_pos and flag == 0: | |
92 break | |
93 if end and pos <= start_pos and flag == 0: | |
94 break | |
95 | |
96 #print >>sys.stderr, "pos:%s %s" % (pos, start_pos) | |
97 | |
98 extra = abs(pos - start_pos) | |
99 if args.verbose: | |
100 print("extra %s" % (extra), file=sys.stderr) | |
101 if extra: | |
102 if flag == 0: | |
103 if args.verbose: | |
104 print("Inserted a %s, %s" % (0, extra), file=sys.stderr) | |
105 | |
106 if end: | |
107 cigar.append((0, extra)) | |
108 else: | |
109 cigar.insert(0, (0, extra)) | |
110 eaten -= extra | |
111 | |
112 if not end: | |
113 s.pos = pos - extra | |
114 | |
115 if args.verbose: | |
116 print("New pos: %s" % (s.pos), file=sys.stderr) | |
117 | |
118 if end: | |
119 cigar.append((4, eaten)) | |
120 else: | |
121 cigar.insert(0, (4, eaten)) | |
122 oldcigarstring = s.cigarstring | |
123 s.cigartuples = cigar | |
124 | |
125 #print >>sys.stderr, s.query_name, oldcigarstring[0:50], s.cigarstring[0:50] | |
126 | |
127 def find_primer(bed, pos, direction): | |
128 from operator import itemgetter | |
129 | |
130 closest = min([(abs(p['start'] - pos), p['start'] - pos, p) for p in bed if p['direction'] == direction], key=itemgetter(0)) | |
131 return closest | |
132 | |
133 def is_correctly_paired(p1, p2): | |
134 name1 = p1[2]['Primer_ID'] | |
135 name2 = p2[2]['Primer_ID'] | |
136 | |
137 name1 = name1.replace('_LEFT', '') | |
138 name2 = name2.replace('_RIGHT', '') | |
139 | |
140 return name1 == name2 | |
141 | |
142 def go(args): | |
143 if args.report: | |
144 reportfh = open(args.report, "w") | |
145 print("QueryName\tReferenceStart\tReferenceEnd\tPrimerPair\tPrimer1\tPrimer1Start\tPrimer2\tPrimer2Start\tIsSecondary\tIsSupplementary\tStart\tEnd\tCorrectlyPaired", file=reportfh) | |
146 | |
147 bed = read_bed_file(args.bedfile) | |
148 | |
149 counter = defaultdict(int) | |
150 | |
151 infile = pysam.AlignmentFile("-", "rb") | |
152 outfile = pysam.AlignmentFile("-", "wh", template=infile) | |
153 for s in infile: | |
154 cigar = copy(s.cigartuples) | |
155 | |
156 ## logic - if alignment start site is _before_ but within X bases of | |
157 ## a primer site, trim it off | |
158 | |
159 if s.is_unmapped: | |
160 print("%s skipped as unmapped" % (s.query_name), file=sys.stderr) | |
161 continue | |
162 | |
163 if s.is_supplementary: | |
164 print("%s skipped as supplementary" % (s.query_name), file=sys.stderr) | |
165 continue | |
166 | |
167 p1 = find_primer(bed, s.reference_start, '+') | |
168 p2 = find_primer(bed, s.reference_end, '-') | |
169 | |
170 correctly_paired = is_correctly_paired(p1, p2) | |
171 | |
172 report = "%s\t%s\t%s\t%s_%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d" % (s.query_name, s.reference_start, s.reference_end, p1[2]['Primer_ID'], p2[2]['Primer_ID'], p1[2]['Primer_ID'], abs(p1[1]), p2[2]['Primer_ID'], abs(p2[1]), s.is_secondary, s.is_supplementary, p1[2]['start'], p2[2]['end'], correctly_paired) | |
173 if args.report: | |
174 print(report, file=reportfh) | |
175 | |
176 if args.verbose: | |
177 print(report, file=sys.stderr) | |
178 | |
179 ## if the alignment starts before the end of the primer, trim to that position | |
180 | |
181 try: | |
182 if args.start: | |
183 primer_position = p1[2]['start'] | |
184 else: | |
185 primer_position = p1[2]['end'] | |
186 | |
187 if s.reference_start < primer_position: | |
188 trim(args, cigar, s, primer_position, 0) | |
189 else: | |
190 if args.verbose: | |
191 print("ref start %s >= primer_position %s" % (s.reference_start, primer_position), file=sys.stderr) | |
192 | |
193 if args.start: | |
194 primer_position = p2[2]['start'] | |
195 else: | |
196 primer_position = p2[2]['end'] | |
197 | |
198 if s.reference_end > primer_position: | |
199 trim(args, cigar, s, primer_position, 1) | |
200 else: | |
201 if args.verbose: | |
202 print("ref end %s >= primer_position %s" % (s.reference_end, primer_position), file=sys.stderr) | |
203 except Exception as e: | |
204 print("problem %s" % (e,), file=sys.stderr) | |
205 pass | |
206 | |
207 if args.normalise: | |
208 pair = "%s-%s-%d" % (p1[2]['Primer_ID'], p2[2]['Primer_ID'], s.is_reverse) | |
209 counter[pair] += 1 | |
210 | |
211 if counter[pair] > args.normalise: | |
212 continue | |
213 | |
214 if not check_still_matching_bases(s): | |
215 continue | |
216 | |
217 outfile.write(s) | |
218 | |
219 reportfh.close() | |
220 | |
221 def main(): | |
222 import argparse | |
223 | |
224 parser = argparse.ArgumentParser(description='Trim alignments from an amplicon scheme.') | |
225 parser.add_argument('bedfile', help='BED file containing the amplicon scheme') | |
226 parser.add_argument('--normalise', type=int, help='Subsample to n coverage per strand') | |
227 parser.add_argument('--report', type=str, help='Output report to file') | |
228 parser.add_argument('--start', action='store_true', help='Trim to start of primers instead of ends') | |
229 parser.add_argument('--verbose', action='store_true', help='Debug mode') | |
230 | |
231 args = parser.parse_args() | |
232 go(args) | |
233 | |
234 | |
235 if __name__ == "__main__": | |
236 main() |