# HG changeset patch
# User mvdbeek
# Date 1486542965 18000
# Node ID 8138277ea0e5ba694223acb88db3220bf962743b
# Parent 66aa515e360ea02bf9dfc4ef2aa57c5bba69a9c6
planemo upload for repository https://github.com/bardin-lab/tag_reads/tree/master/galaxy commit e4243f5ba5d89a06c1ae68cc49d830f1b6e9cb46
diff -r 66aa515e360e -r 8138277ea0e5 allow_dovetailing.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/allow_dovetailing.xml Wed Feb 08 03:36:05 2017 -0500
@@ -0,0 +1,34 @@
+
+ modifies proper_pair flag in bam files
+
+ tag_reads
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 66aa515e360e -r 8138277ea0e5 bam_tag_reads.xml
--- a/bam_tag_reads.xml Sun Feb 05 14:27:17 2017 -0500
+++ b/bam_tag_reads.xml Wed Feb 08 03:36:05 2017 -0500
@@ -1,16 +1,17 @@
-
+
from multiple bam files
macros.xml
- tag_reads
+ tag_reads
@@ -24,6 +25,7 @@
+
diff -r 66aa515e360e -r 8138277ea0e5 bam_tag_reads.xml.bak
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bam_tag_reads.xml.bak Wed Feb 08 03:36:05 2017 -0500
@@ -0,0 +1,63 @@
+
+ from multiple bam files
+
+ macros.xml
+
+
+ tag_reads
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 66aa515e360e -r 8138277ea0e5 tag_reads.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tag_reads.py Wed Feb 08 03:36:05 2017 -0500
@@ -0,0 +1,205 @@
+import argparse
+import pysam
+import six
+
+
+class SamTagProcessor(object):
+ def __init__(self, source_path, tag_prefix_self, tag_prefix_mate, tag_mate=True):
+ self.tag_mate = tag_mate
+ self.tag_prefix_self = tag_prefix_self
+ self.tag_prefix_mate = tag_prefix_mate
+ self.source_alignment = pysam.AlignmentFile(source_path)
+ self.result = self.process_source()
+ if self.tag_mate:
+ self.add_mate()
+
+ def compute_tag(self, r):
+ """
+ Adds tags for downstream processing:
+ - Reference
+ - Pos
+ - Sense
+ - Aligned position
+ - MD
+ Returns a tag of the form:
+ [('AA'), ('R:gypsy,POS:1,MD:107S35M,S:AS')]
+ 'AA' stands for alternative alignment,
+ 'MA' stands for mate alignment.
+ :param r: AlignedSegment
+ """
+ tags = dict(ref=self.source_alignment.get_reference_name(r.tid),
+ pos=r.reference_start,
+ cigar=r.cigarstring,
+ sense='AS' if r.is_reverse else 'S')
+ detail_tag_value = "R:{ref},POS:{pos},CIGAR:{cigar},S:{sense}".format(**tags)
+ ref_tag_value = tags['ref']
+ detail_tag = "%sD" % self.tag_prefix_self
+ ref_tag = "%sR" % self.tag_prefix_self
+ return [(detail_tag, detail_tag_value), (ref_tag, ref_tag_value)]
+
+ def is_taggable(self, r):
+ """
+ Decide if a read should be tagged
+ :param r: AlignedSegment
+ :type r: pysam.Alignedread
+ """
+ return not r.is_qcfail and not r.is_secondary and not r.is_supplementary and not r.is_unmapped
+
+ def process_source(self):
+ tag_d = {}
+ for r in self.source_alignment:
+ if self.is_taggable(r):
+ fw_rev = 'r1' if r.is_read1 else 'r2'
+ if r.query_name in tag_d:
+ tag_d[r.query_name][fw_rev] = self.compute_tag(r)
+ else:
+ tag_d[r.query_name] = {fw_rev: self.compute_tag(r)}
+ return tag_d
+
+ def add_mate(self):
+ detail_tag = "%sD" % self.tag_prefix_mate
+ ref_tag = "%sR" % self.tag_prefix_mate
+ for qname, tag_d in six.iteritems(self.result):
+ if len(tag_d) == 2:
+ # Both mates aligned, add mate tag
+ tag_d['r1'].extend([(detail_tag, tag_d['r2'][0][1]), (ref_tag, tag_d['r2'][1][1])])
+ tag_d['r2'].extend([(detail_tag, tag_d['r1'][0][1]), (ref_tag, tag_d['r1'][1][1])])
+ elif len(tag_d) == 1:
+ # Only one of the mates mapped, so we fill the mate with its mate tag
+ if 'r1' in tag_d:
+ tag_d['r2'] = [(detail_tag, tag_d['r1'][0][1]), (ref_tag, tag_d['r1'][1][1])]
+ else:
+ tag_d['r1'] = [(detail_tag, tag_d['r2'][0][1]), (ref_tag, tag_d['r2'][1][1])]
+ else:
+ continue
+ self.result[qname] = tag_d
+
+ def get_tag(self, other_r):
+ """
+ convinience method that takes a read object `other_r` and fetches the
+ annotation tag that has been processed in the instance
+ """
+ other_r_reverse = 'r1' if other_r.is_read1 else 'r2'
+ tagged_mates = self.result.get(other_r.query_name)
+ if tagged_mates:
+ return tagged_mates[other_r_reverse]
+ else:
+ return None
+
+class SamAnnotator(object):
+ def __init__(self, annotate_file, samtags, output_path="test.bam", allow_dovetailing=False):
+ """
+ Compare `samtags` with `annotate_file`.
+ Produces a new alignment file at output_path.
+ :param annotate_file: 'Path to bam/sam file'
+ :type annotate_file: str
+ :param samtags: list of SamTagProcessor instances
+ :type samtags: List[SamTagProcessor]
+ :param allow_dovetailing: Controls whether or not dovetailing should be allowed
+ :type allow_dovetailing: bool
+ """
+ self.annotate_file = pysam.AlignmentFile(annotate_file)
+ self.output = pysam.AlignmentFile(output_path, 'wb', template=self.annotate_file)
+ self.samtags = samtags
+ self.process(allow_dovetailing)
+
+ def process(self, allow_dovetailing=False):
+ if allow_dovetailing:
+ max_proper_size = self.get_max_proper_pair_size(self.annotate_file)
+ for read in self.annotate_file:
+ for samtag in self.samtags:
+ annotated_tag = samtag.get_tag(read)
+ if annotated_tag:
+ read.tags = annotated_tag + read.tags
+ if allow_dovetailing:
+ read = self.allow_dovetailing(read, max_proper_size)
+ self.output.write(read)
+ self.output.close()
+
+ @classmethod
+ def get_max_proper_pair_size(cls, alignment_file):
+ """
+ iterate over the first 1000 properly paired records in alignment_file
+ and get the maximum valid isize for a proper pair.
+ :param alignment_file: pysam.AlignmentFile
+ :type alignment_file: pysam.AlignmentFile
+ :rtype int
+ """
+ isize = []
+ for r in alignment_file:
+ if r.is_proper_pair and not r.is_secondary and not r.is_supplementary:
+ isize.append(abs(r.isize))
+ if len(isize) == 1000:
+ alignment_file.reset()
+ return max(isize)
+ alignment_file.reset()
+ return max(isize)
+
+ @classmethod
+ def allow_dovetailing(cls, read, max_proper_size=351):
+ """
+ Manipulates is_proper_pair tag to allow dovetailing of reads.
+ Precondition is read and mate have the same reference id, are within the maximum proper pair distance
+ and are either in FR or RF orientation.
+ :param read: aligned segment of pysam.AlignmentFile
+ :type read: pysam.AlignedSegment
+ :rtype pysam.AlignedSegment
+ """
+ if not read.is_proper_pair and not read.is_reverse == read.mate_is_reverse and read.reference_id == read.mrnm and abs(read.isize) <= max_proper_size:
+ read.is_proper_pair = True
+ return read
+
+
+def parse_file_tags(filetags):
+ """
+ :param filetags: string with filepath.
+ optionally appended by the first letter that should be used for read and mate
+ :return: annotate_with, tag_prefix, tag_prefix_mate
+
+ >>> filetags = ['file_a:A:B', 'file_b:C:D', 'file_c']
+ >>> annotate_with, tag_prefix, tag_prefix_mate = parse_file_tags(filetags)
+ >>> annotate_with == ['file_a', 'file_b', 'file_c'] and tag_prefix == ['A', 'C', 'R'] and tag_prefix_mate == ['B', 'D', 'M']
+ True
+ >>>
+ """
+ annotate_with = []
+ tag_prefix = []
+ tag_prefix_mate = []
+ for filetag in filetags:
+ if ':' in filetag:
+ filepath, tag, tag_mate = filetag.split(':')
+ annotate_with.append(filepath)
+ tag_prefix.append(tag.upper())
+ tag_prefix_mate.append(tag_mate.upper())
+ else:
+ annotate_with.append(filetag)
+ tag_prefix.append('R') # Default is R for read, M for mate
+ tag_prefix_mate.append('M')
+ return annotate_with, tag_prefix, tag_prefix_mate
+
+
+
+def parse_args():
+ p = argparse.ArgumentParser(description="Tag reads in an alignment file based on other alignment files",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ p.add_argument('-t', '--tag_file', help="Tag reads in this file.", required=True)
+ p.add_argument('-a', '--annotate_with',
+ help="Tag reads in readfile if reads are aligned in these files."
+ "Append `:A:B` to tag first letter of tag describing read as A, "
+ "and first letter of tag describing the mate as B",
+ nargs = "+",
+ required=True)
+ p.add_argument('-o', '--output_file', help="Write bam file to this path", required=True)
+ p.add_argument('-d', '--allow_dovetailing',
+ action='store_true',
+ help="Sets the proper pair flag (0x0002) to true if reads dovetail [reads reach into or surpass the mate sequence].")
+ return p.parse_args()
+
+def main():
+ args = parse_args()
+ files_tags = zip(*parse_file_tags(args.annotate_with))
+ samtags = [SamTagProcessor(filepath, tag_prefix_self=tag, tag_prefix_mate=tag_mate) for (filepath, tag, tag_mate) in files_tags ]
+ SamAnnotator(annotate_file=args.tag_file, samtags=samtags, output_path=args.output_file)
+
+if __name__ == "__main__":
+ main()
diff -r 66aa515e360e -r 8138277ea0e5 test.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test.sh Wed Feb 08 03:36:05 2017 -0500
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+set -e
+
+if grep -v 'python tag_reads.py' bam_tag_reads.xml
+then
+ sed -i.bak 's/tag_reads -t/python \$__tool_directory__\/tag_reads.py -t/g' bam_tag_reads.xml
+fi
+cp ../tag_reads/tag_reads.py .
+planemo test --conda_prefix ~/miniconda3