comparison align_trim.py @ 0:defebd1f95b9 draft

"planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/artic_align_trim commit 374121821497c96c8450afda266951c2f431ba11-dirty"
author dfornika
date Tue, 10 Mar 2020 22:03:49 +0000
parents
children 26516cf26444
comparison
equal deleted inserted replaced
-1:000000000000 0:defebd1f95b9
1 #!/usr/bin/env python
2
3 # Written by Nick Loman
4 # Originally part of the ZiBRA pipeline (zibraproject.org)
5 # This file adapted from ARTICnetwork 'fieldbioinformatics' pipeline:
6 # https://github.com/artic-network/fieldbioinformatics
7
8 import sys
9
10 from collections import defaultdict
11 from copy import copy
12
13 import pysam
14
15 def read_bed_file(fn):
16 bedfile = []
17 with open(fn) as csvfile:
18 reader = csv.reader(csvfile, dialect='excel-tab')
19 for row in reader:
20 bedrow = {}
21 bedrow['Primer_ID'] = row[3]
22
23 if len(row) >= 6:
24 # new style bed
25 bedrow['direction'] = row[5]
26 elif len(row) == 5:
27 # old style without directory
28 if 'LEFT' in row[3]:
29 bedrow['direction'] = '+'
30 elif 'RIGHT' in row[3]:
31 bedrow['direction'] = '-'
32 else:
33 print("Malformed BED file!", file=sys.stderr)
34 raise SystemExit
35
36 if bedrow['direction'] == '+':
37 bedrow['end'] = int(row[2])
38 bedrow['start'] = int(row[1])
39 else:
40 bedrow['end'] = int(row[1])
41 bedrow['start'] = int(row[2])
42 bedfile.append(bedrow)
43 return bedfile
44
45
46 def check_still_matching_bases(s):
47 for flag, length in s.cigartuples:
48 if flag == 0:
49 return True
50 return False
51
52 def trim(args, cigar, s, start_pos, end):
53 if not end:
54 pos = s.pos
55 else:
56 pos = s.reference_end
57
58 eaten = 0
59 while 1:
60 ## chomp stuff off until we reach pos
61 if end:
62 flag, length = cigar.pop()
63 else:
64 flag, length = cigar.pop(0)
65
66 if args.verbose:
67 print("Chomped a %s, %s" % (flag, length), file=sys.stderr)
68
69 if flag == 0:
70 ## match
71 #to_trim -= length
72 eaten += length
73 if not end:
74 pos += length
75 else:
76 pos -= length
77 if flag == 1:
78 ## insertion to the ref
79 #to_trim -= length
80 eaten += length
81 if flag == 2:
82 ## deletion to the ref
83 #eaten += length
84 if not end:
85 pos += length
86 else:
87 pos -= length
88 pass
89 if flag == 4:
90 eaten += length
91 if not end and pos >= start_pos and flag == 0:
92 break
93 if end and pos <= start_pos and flag == 0:
94 break
95
96 #print >>sys.stderr, "pos:%s %s" % (pos, start_pos)
97
98 extra = abs(pos - start_pos)
99 if args.verbose:
100 print("extra %s" % (extra), file=sys.stderr)
101 if extra:
102 if flag == 0:
103 if args.verbose:
104 print("Inserted a %s, %s" % (0, extra), file=sys.stderr)
105
106 if end:
107 cigar.append((0, extra))
108 else:
109 cigar.insert(0, (0, extra))
110 eaten -= extra
111
112 if not end:
113 s.pos = pos - extra
114
115 if args.verbose:
116 print("New pos: %s" % (s.pos), file=sys.stderr)
117
118 if end:
119 cigar.append((4, eaten))
120 else:
121 cigar.insert(0, (4, eaten))
122 oldcigarstring = s.cigarstring
123 s.cigartuples = cigar
124
125 #print >>sys.stderr, s.query_name, oldcigarstring[0:50], s.cigarstring[0:50]
126
127 def find_primer(bed, pos, direction):
128 from operator import itemgetter
129
130 closest = min([(abs(p['start'] - pos), p['start'] - pos, p) for p in bed if p['direction'] == direction], key=itemgetter(0))
131 return closest
132
133 def is_correctly_paired(p1, p2):
134 name1 = p1[2]['Primer_ID']
135 name2 = p2[2]['Primer_ID']
136
137 name1 = name1.replace('_LEFT', '')
138 name2 = name2.replace('_RIGHT', '')
139
140 return name1 == name2
141
142 def go(args):
143 if args.report:
144 reportfh = open(args.report, "w")
145 print("QueryName\tReferenceStart\tReferenceEnd\tPrimerPair\tPrimer1\tPrimer1Start\tPrimer2\tPrimer2Start\tIsSecondary\tIsSupplementary\tStart\tEnd\tCorrectlyPaired", file=reportfh)
146
147 bed = read_bed_file(args.bedfile)
148
149 counter = defaultdict(int)
150
151 infile = pysam.AlignmentFile("-", "rb")
152 outfile = pysam.AlignmentFile("-", "wh", template=infile)
153 for s in infile:
154 cigar = copy(s.cigartuples)
155
156 ## logic - if alignment start site is _before_ but within X bases of
157 ## a primer site, trim it off
158
159 if s.is_unmapped:
160 print("%s skipped as unmapped" % (s.query_name), file=sys.stderr)
161 continue
162
163 if s.is_supplementary:
164 print("%s skipped as supplementary" % (s.query_name), file=sys.stderr)
165 continue
166
167 p1 = find_primer(bed, s.reference_start, '+')
168 p2 = find_primer(bed, s.reference_end, '-')
169
170 correctly_paired = is_correctly_paired(p1, p2)
171
172 report = "%s\t%s\t%s\t%s_%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d" % (s.query_name, s.reference_start, s.reference_end, p1[2]['Primer_ID'], p2[2]['Primer_ID'], p1[2]['Primer_ID'], abs(p1[1]), p2[2]['Primer_ID'], abs(p2[1]), s.is_secondary, s.is_supplementary, p1[2]['start'], p2[2]['end'], correctly_paired)
173 if args.report:
174 print(report, file=reportfh)
175
176 if args.verbose:
177 print(report, file=sys.stderr)
178
179 ## if the alignment starts before the end of the primer, trim to that position
180
181 try:
182 if args.start:
183 primer_position = p1[2]['start']
184 else:
185 primer_position = p1[2]['end']
186
187 if s.reference_start < primer_position:
188 trim(args, cigar, s, primer_position, 0)
189 else:
190 if args.verbose:
191 print("ref start %s >= primer_position %s" % (s.reference_start, primer_position), file=sys.stderr)
192
193 if args.start:
194 primer_position = p2[2]['start']
195 else:
196 primer_position = p2[2]['end']
197
198 if s.reference_end > primer_position:
199 trim(args, cigar, s, primer_position, 1)
200 else:
201 if args.verbose:
202 print("ref end %s >= primer_position %s" % (s.reference_end, primer_position), file=sys.stderr)
203 except Exception as e:
204 print("problem %s" % (e,), file=sys.stderr)
205 pass
206
207 if args.normalise:
208 pair = "%s-%s-%d" % (p1[2]['Primer_ID'], p2[2]['Primer_ID'], s.is_reverse)
209 counter[pair] += 1
210
211 if counter[pair] > args.normalise:
212 continue
213
214 if not check_still_matching_bases(s):
215 continue
216
217 outfile.write(s)
218
219 reportfh.close()
220
221 def main():
222 import argparse
223
224 parser = argparse.ArgumentParser(description='Trim alignments from an amplicon scheme.')
225 parser.add_argument('bedfile', help='BED file containing the amplicon scheme')
226 parser.add_argument('--normalise', type=int, help='Subsample to n coverage per strand')
227 parser.add_argument('--report', type=str, help='Output report to file')
228 parser.add_argument('--start', action='store_true', help='Trim to start of primers instead of ends')
229 parser.add_argument('--verbose', action='store_true', help='Debug mode')
230
231 args = parser.parse_args()
232 go(args)
233
234
235 if __name__ == "__main__":
236 main()