Mercurial > repos > dfornika > artic_align_trim
comparison align_trim.py @ 0:defebd1f95b9 draft
"planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/artic_align_trim commit 374121821497c96c8450afda266951c2f431ba11-dirty"
| author | dfornika |
|---|---|
| date | Tue, 10 Mar 2020 22:03:49 +0000 |
| parents | |
| children | 26516cf26444 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:defebd1f95b9 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 # Written by Nick Loman | |
| 4 # Originally part of the ZiBRA pipeline (zibraproject.org) | |
| 5 # This file adapted from ARTICnetwork 'fieldbioinformatics' pipeline: | |
| 6 # https://github.com/artic-network/fieldbioinformatics | |
| 7 | |
| 8 import sys | |
| 9 | |
| 10 from collections import defaultdict | |
| 11 from copy import copy | |
| 12 | |
| 13 import pysam | |
| 14 | |
| 15 def read_bed_file(fn): | |
| 16 bedfile = [] | |
| 17 with open(fn) as csvfile: | |
| 18 reader = csv.reader(csvfile, dialect='excel-tab') | |
| 19 for row in reader: | |
| 20 bedrow = {} | |
| 21 bedrow['Primer_ID'] = row[3] | |
| 22 | |
| 23 if len(row) >= 6: | |
| 24 # new style bed | |
| 25 bedrow['direction'] = row[5] | |
| 26 elif len(row) == 5: | |
| 27 # old style without directory | |
| 28 if 'LEFT' in row[3]: | |
| 29 bedrow['direction'] = '+' | |
| 30 elif 'RIGHT' in row[3]: | |
| 31 bedrow['direction'] = '-' | |
| 32 else: | |
| 33 print("Malformed BED file!", file=sys.stderr) | |
| 34 raise SystemExit | |
| 35 | |
| 36 if bedrow['direction'] == '+': | |
| 37 bedrow['end'] = int(row[2]) | |
| 38 bedrow['start'] = int(row[1]) | |
| 39 else: | |
| 40 bedrow['end'] = int(row[1]) | |
| 41 bedrow['start'] = int(row[2]) | |
| 42 bedfile.append(bedrow) | |
| 43 return bedfile | |
| 44 | |
| 45 | |
| 46 def check_still_matching_bases(s): | |
| 47 for flag, length in s.cigartuples: | |
| 48 if flag == 0: | |
| 49 return True | |
| 50 return False | |
| 51 | |
| 52 def trim(args, cigar, s, start_pos, end): | |
| 53 if not end: | |
| 54 pos = s.pos | |
| 55 else: | |
| 56 pos = s.reference_end | |
| 57 | |
| 58 eaten = 0 | |
| 59 while 1: | |
| 60 ## chomp stuff off until we reach pos | |
| 61 if end: | |
| 62 flag, length = cigar.pop() | |
| 63 else: | |
| 64 flag, length = cigar.pop(0) | |
| 65 | |
| 66 if args.verbose: | |
| 67 print("Chomped a %s, %s" % (flag, length), file=sys.stderr) | |
| 68 | |
| 69 if flag == 0: | |
| 70 ## match | |
| 71 #to_trim -= length | |
| 72 eaten += length | |
| 73 if not end: | |
| 74 pos += length | |
| 75 else: | |
| 76 pos -= length | |
| 77 if flag == 1: | |
| 78 ## insertion to the ref | |
| 79 #to_trim -= length | |
| 80 eaten += length | |
| 81 if flag == 2: | |
| 82 ## deletion to the ref | |
| 83 #eaten += length | |
| 84 if not end: | |
| 85 pos += length | |
| 86 else: | |
| 87 pos -= length | |
| 88 pass | |
| 89 if flag == 4: | |
| 90 eaten += length | |
| 91 if not end and pos >= start_pos and flag == 0: | |
| 92 break | |
| 93 if end and pos <= start_pos and flag == 0: | |
| 94 break | |
| 95 | |
| 96 #print >>sys.stderr, "pos:%s %s" % (pos, start_pos) | |
| 97 | |
| 98 extra = abs(pos - start_pos) | |
| 99 if args.verbose: | |
| 100 print("extra %s" % (extra), file=sys.stderr) | |
| 101 if extra: | |
| 102 if flag == 0: | |
| 103 if args.verbose: | |
| 104 print("Inserted a %s, %s" % (0, extra), file=sys.stderr) | |
| 105 | |
| 106 if end: | |
| 107 cigar.append((0, extra)) | |
| 108 else: | |
| 109 cigar.insert(0, (0, extra)) | |
| 110 eaten -= extra | |
| 111 | |
| 112 if not end: | |
| 113 s.pos = pos - extra | |
| 114 | |
| 115 if args.verbose: | |
| 116 print("New pos: %s" % (s.pos), file=sys.stderr) | |
| 117 | |
| 118 if end: | |
| 119 cigar.append((4, eaten)) | |
| 120 else: | |
| 121 cigar.insert(0, (4, eaten)) | |
| 122 oldcigarstring = s.cigarstring | |
| 123 s.cigartuples = cigar | |
| 124 | |
| 125 #print >>sys.stderr, s.query_name, oldcigarstring[0:50], s.cigarstring[0:50] | |
| 126 | |
| 127 def find_primer(bed, pos, direction): | |
| 128 from operator import itemgetter | |
| 129 | |
| 130 closest = min([(abs(p['start'] - pos), p['start'] - pos, p) for p in bed if p['direction'] == direction], key=itemgetter(0)) | |
| 131 return closest | |
| 132 | |
| 133 def is_correctly_paired(p1, p2): | |
| 134 name1 = p1[2]['Primer_ID'] | |
| 135 name2 = p2[2]['Primer_ID'] | |
| 136 | |
| 137 name1 = name1.replace('_LEFT', '') | |
| 138 name2 = name2.replace('_RIGHT', '') | |
| 139 | |
| 140 return name1 == name2 | |
| 141 | |
| 142 def go(args): | |
| 143 if args.report: | |
| 144 reportfh = open(args.report, "w") | |
| 145 print("QueryName\tReferenceStart\tReferenceEnd\tPrimerPair\tPrimer1\tPrimer1Start\tPrimer2\tPrimer2Start\tIsSecondary\tIsSupplementary\tStart\tEnd\tCorrectlyPaired", file=reportfh) | |
| 146 | |
| 147 bed = read_bed_file(args.bedfile) | |
| 148 | |
| 149 counter = defaultdict(int) | |
| 150 | |
| 151 infile = pysam.AlignmentFile("-", "rb") | |
| 152 outfile = pysam.AlignmentFile("-", "wh", template=infile) | |
| 153 for s in infile: | |
| 154 cigar = copy(s.cigartuples) | |
| 155 | |
| 156 ## logic - if alignment start site is _before_ but within X bases of | |
| 157 ## a primer site, trim it off | |
| 158 | |
| 159 if s.is_unmapped: | |
| 160 print("%s skipped as unmapped" % (s.query_name), file=sys.stderr) | |
| 161 continue | |
| 162 | |
| 163 if s.is_supplementary: | |
| 164 print("%s skipped as supplementary" % (s.query_name), file=sys.stderr) | |
| 165 continue | |
| 166 | |
| 167 p1 = find_primer(bed, s.reference_start, '+') | |
| 168 p2 = find_primer(bed, s.reference_end, '-') | |
| 169 | |
| 170 correctly_paired = is_correctly_paired(p1, p2) | |
| 171 | |
| 172 report = "%s\t%s\t%s\t%s_%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d" % (s.query_name, s.reference_start, s.reference_end, p1[2]['Primer_ID'], p2[2]['Primer_ID'], p1[2]['Primer_ID'], abs(p1[1]), p2[2]['Primer_ID'], abs(p2[1]), s.is_secondary, s.is_supplementary, p1[2]['start'], p2[2]['end'], correctly_paired) | |
| 173 if args.report: | |
| 174 print(report, file=reportfh) | |
| 175 | |
| 176 if args.verbose: | |
| 177 print(report, file=sys.stderr) | |
| 178 | |
| 179 ## if the alignment starts before the end of the primer, trim to that position | |
| 180 | |
| 181 try: | |
| 182 if args.start: | |
| 183 primer_position = p1[2]['start'] | |
| 184 else: | |
| 185 primer_position = p1[2]['end'] | |
| 186 | |
| 187 if s.reference_start < primer_position: | |
| 188 trim(args, cigar, s, primer_position, 0) | |
| 189 else: | |
| 190 if args.verbose: | |
| 191 print("ref start %s >= primer_position %s" % (s.reference_start, primer_position), file=sys.stderr) | |
| 192 | |
| 193 if args.start: | |
| 194 primer_position = p2[2]['start'] | |
| 195 else: | |
| 196 primer_position = p2[2]['end'] | |
| 197 | |
| 198 if s.reference_end > primer_position: | |
| 199 trim(args, cigar, s, primer_position, 1) | |
| 200 else: | |
| 201 if args.verbose: | |
| 202 print("ref end %s >= primer_position %s" % (s.reference_end, primer_position), file=sys.stderr) | |
| 203 except Exception as e: | |
| 204 print("problem %s" % (e,), file=sys.stderr) | |
| 205 pass | |
| 206 | |
| 207 if args.normalise: | |
| 208 pair = "%s-%s-%d" % (p1[2]['Primer_ID'], p2[2]['Primer_ID'], s.is_reverse) | |
| 209 counter[pair] += 1 | |
| 210 | |
| 211 if counter[pair] > args.normalise: | |
| 212 continue | |
| 213 | |
| 214 if not check_still_matching_bases(s): | |
| 215 continue | |
| 216 | |
| 217 outfile.write(s) | |
| 218 | |
| 219 reportfh.close() | |
| 220 | |
| 221 def main(): | |
| 222 import argparse | |
| 223 | |
| 224 parser = argparse.ArgumentParser(description='Trim alignments from an amplicon scheme.') | |
| 225 parser.add_argument('bedfile', help='BED file containing the amplicon scheme') | |
| 226 parser.add_argument('--normalise', type=int, help='Subsample to n coverage per strand') | |
| 227 parser.add_argument('--report', type=str, help='Output report to file') | |
| 228 parser.add_argument('--start', action='store_true', help='Trim to start of primers instead of ends') | |
| 229 parser.add_argument('--verbose', action='store_true', help='Debug mode') | |
| 230 | |
| 231 args = parser.parse_args() | |
| 232 go(args) | |
| 233 | |
| 234 | |
| 235 if __name__ == "__main__": | |
| 236 main() |
