diff align_trim.py @ 0:defebd1f95b9 draft

"planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/artic_align_trim commit 374121821497c96c8450afda266951c2f431ba11-dirty"
author dfornika
date Tue, 10 Mar 2020 22:03:49 +0000
parents
children 26516cf26444
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/align_trim.py	Tue Mar 10 22:03:49 2020 +0000
@@ -0,0 +1,236 @@
+#!/usr/bin/env python
+
+# Written by Nick Loman
+# Originally part of the ZiBRA pipeline (zibraproject.org)
+# This file adapted from ARTICnetwork 'fieldbioinformatics' pipeline:
+# https://github.com/artic-network/fieldbioinformatics
+
+import sys
+
+from collections import defaultdict
+from copy import copy
+
+import pysam
+
+def read_bed_file(fn):
+    bedfile = []
+    with open(fn) as csvfile:
+        reader = csv.reader(csvfile, dialect='excel-tab')
+        for row in reader:
+            bedrow = {}
+            bedrow['Primer_ID'] = row[3]
+
+            if len(row) >= 6:
+                # new style bed
+                bedrow['direction'] = row[5]
+            elif len(row) == 5:
+                # old style without directory
+                if 'LEFT' in row[3]:
+                    bedrow['direction'] = '+'
+                elif 'RIGHT' in row[3]:
+                    bedrow['direction'] = '-'
+                else:
+                    print("Malformed BED file!", file=sys.stderr) 
+                    raise SystemExit
+
+            if bedrow['direction'] == '+':
+                bedrow['end'] = int(row[2])
+                bedrow['start'] = int(row[1])
+            else:
+                bedrow['end'] = int(row[1])
+                bedrow['start'] = int(row[2])
+            bedfile.append(bedrow)
+    return bedfile
+
+
+def check_still_matching_bases(s):
+    for flag, length in s.cigartuples:
+        if flag == 0:
+            return True
+    return False
+
+def trim(args, cigar, s, start_pos, end):
+    if not end:
+        pos = s.pos
+    else:
+        pos = s.reference_end
+
+    eaten = 0
+    while 1:
+        ## chomp stuff off until we reach pos
+        if end:
+            flag, length = cigar.pop()
+        else:
+            flag, length = cigar.pop(0)
+
+        if args.verbose:
+            print("Chomped a %s, %s" % (flag, length), file=sys.stderr)
+
+        if flag == 0:
+            ## match
+            #to_trim -= length
+            eaten += length
+            if not end:
+                 pos += length
+            else:
+                 pos -= length
+        if flag == 1:
+            ## insertion to the ref
+            #to_trim -= length
+            eaten += length
+        if flag == 2:
+            ## deletion to the ref
+            #eaten += length
+            if not end:
+                pos += length
+            else:
+                pos -= length
+            pass
+        if flag == 4:
+            eaten += length
+        if not end and pos >= start_pos and flag == 0:
+            break
+        if end and pos <= start_pos and flag == 0:
+            break
+
+        #print >>sys.stderr, "pos:%s %s" % (pos, start_pos)
+
+    extra = abs(pos - start_pos)
+    if args.verbose:
+        print("extra %s" % (extra), file=sys.stderr)
+    if extra:
+        if flag == 0:
+            if args.verbose:
+                print("Inserted a %s, %s" % (0, extra), file=sys.stderr)
+
+            if end:
+                cigar.append((0, extra))
+            else:
+                cigar.insert(0, (0, extra))
+            eaten -= extra
+
+    if not end:
+        s.pos = pos - extra
+
+    if args.verbose:
+        print("New pos: %s" % (s.pos), file=sys.stderr)
+
+    if end:
+        cigar.append((4, eaten))
+    else:
+        cigar.insert(0, (4, eaten))
+    oldcigarstring = s.cigarstring
+    s.cigartuples = cigar
+
+    #print >>sys.stderr,  s.query_name, oldcigarstring[0:50], s.cigarstring[0:50]
+
+def find_primer(bed, pos, direction):
+    from operator import itemgetter
+
+    closest = min([(abs(p['start'] - pos), p['start'] - pos, p) for p in bed if p['direction'] == direction], key=itemgetter(0))
+    return closest
+
+def is_correctly_paired(p1, p2):
+    name1 = p1[2]['Primer_ID']
+    name2 = p2[2]['Primer_ID']
+
+    name1 = name1.replace('_LEFT', '')
+    name2 = name2.replace('_RIGHT', '')
+
+    return name1 == name2
+
+def go(args):
+    if args.report:
+        reportfh = open(args.report, "w")
+        print("QueryName\tReferenceStart\tReferenceEnd\tPrimerPair\tPrimer1\tPrimer1Start\tPrimer2\tPrimer2Start\tIsSecondary\tIsSupplementary\tStart\tEnd\tCorrectlyPaired", file=reportfh)
+
+    bed = read_bed_file(args.bedfile)
+
+    counter = defaultdict(int)
+
+    infile = pysam.AlignmentFile("-", "rb")
+    outfile = pysam.AlignmentFile("-", "wh", template=infile)
+    for s in infile:
+        cigar = copy(s.cigartuples)
+
+        ## logic - if alignment start site is _before_ but within X bases of
+        ## a primer site, trim it off
+
+        if s.is_unmapped:
+            print("%s skipped as unmapped" % (s.query_name), file=sys.stderr)
+            continue
+
+        if s.is_supplementary:
+            print("%s skipped as supplementary" % (s.query_name), file=sys.stderr)
+            continue
+
+        p1 = find_primer(bed, s.reference_start, '+')
+        p2 = find_primer(bed, s.reference_end, '-')
+
+        correctly_paired = is_correctly_paired(p1, p2)
+
+        report = "%s\t%s\t%s\t%s_%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d" % (s.query_name, s.reference_start, s.reference_end, p1[2]['Primer_ID'], p2[2]['Primer_ID'], p1[2]['Primer_ID'], abs(p1[1]), p2[2]['Primer_ID'], abs(p2[1]), s.is_secondary, s.is_supplementary, p1[2]['start'], p2[2]['end'], correctly_paired)
+        if args.report:
+            print(report, file=reportfh)
+
+        if args.verbose:
+            print(report, file=sys.stderr)
+
+        ## if the alignment starts before the end of the primer, trim to that position
+
+        try:
+            if args.start:
+                primer_position = p1[2]['start']
+            else:
+                primer_position = p1[2]['end']
+
+            if s.reference_start < primer_position:
+                trim(args, cigar, s, primer_position, 0)
+            else:
+                if args.verbose:
+                    print("ref start %s >= primer_position %s" % (s.reference_start, primer_position), file=sys.stderr)
+
+            if args.start:
+                primer_position = p2[2]['start']
+            else:
+                primer_position = p2[2]['end']
+
+            if s.reference_end > primer_position:
+                trim(args, cigar, s, primer_position, 1)
+            else:
+                if args.verbose:
+                    print("ref end %s >= primer_position %s" % (s.reference_end, primer_position), file=sys.stderr)
+        except Exception as e:
+            print("problem %s" % (e,), file=sys.stderr)
+            pass
+
+        if args.normalise:
+            pair = "%s-%s-%d" % (p1[2]['Primer_ID'], p2[2]['Primer_ID'], s.is_reverse)
+            counter[pair] += 1
+
+            if counter[pair] > args.normalise:
+                continue
+
+        if not check_still_matching_bases(s):
+             continue
+
+        outfile.write(s)
+
+    reportfh.close()
+
+def main():
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Trim alignments from an amplicon scheme.')
+    parser.add_argument('bedfile', help='BED file containing the amplicon scheme')
+    parser.add_argument('--normalise', type=int, help='Subsample to n coverage per strand')
+    parser.add_argument('--report', type=str, help='Output report to file')
+    parser.add_argument('--start', action='store_true', help='Trim to start of primers instead of ends')
+    parser.add_argument('--verbose', action='store_true', help='Debug mode')
+
+    args = parser.parse_args()
+    go(args)
+
+
+if __name__ == "__main__":
+    main()