diff BEDTools-Version-2.14.3/src/pairToBed/pairToBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BEDTools-Version-2.14.3/src/pairToBed/pairToBed.cpp	Thu Nov 03 10:25:04 2011 -0400
@@ -0,0 +1,525 @@
+/*****************************************************************************
+  pairToBed.cpp
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#include "lineFileUtilities.h"
+#include "pairToBed.h"
+
+
+bool IsCorrectMappingForBEDPE (const BamAlignment &bam) {
+
+    if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize > 0) ) {
+        return true;
+    }
+    else if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize == 0) && bam.IsFirstMate() ) {
+        return true;
+    }
+    else if ( (bam.RefID != bam.MateRefID) && bam.IsFirstMate() ) {
+        return true;
+    }
+    else return false;
+}
+
+
+/*
+    Constructor
+*/
+
+
+BedIntersectPE::BedIntersectPE(string bedAFilePE, string bedBFile, float overlapFraction,
+                               string searchType, bool sameStrand, bool diffStrand, bool bamInput,
+                               bool bamOutput,  bool uncompressedBam, bool useEditDistance) {
+
+    _bedAFilePE        = bedAFilePE;
+    _bedBFile          = bedBFile;
+    _overlapFraction   = overlapFraction;
+    _sameStrand        = sameStrand;
+    _diffStrand        = diffStrand;
+    _useEditDistance   = useEditDistance;
+    _searchType        = searchType;
+    _bamInput          = bamInput;
+    _bamOutput         = bamOutput;
+    _isUncompressedBam = uncompressedBam;
+
+    _bedA = new BedFilePE(bedAFilePE);
+    _bedB = new BedFile(bedBFile);
+
+    if (_bamInput == false)
+        IntersectBedPE();
+    else
+        IntersectBamPE(_bedAFilePE);
+}
+
+
+/*
+    Destructor
+*/
+
+BedIntersectPE::~BedIntersectPE(void) {
+}
+
+
+
+void BedIntersectPE::FindOverlaps(const BEDPE &a, vector<BED> &hits1, vector<BED> &hits2, const string &type) {
+
+    // list of hits on each end of BEDPE
+    // that exceed the requested overlap fraction
+    vector<BED> qualityHits1;
+    vector<BED> qualityHits2;
+
+    // count of hits on each end of BEDPE
+    // that exceed the requested overlap fraction
+    int numOverlapsEnd1 = 0;
+    int numOverlapsEnd2 = 0;
+
+    // make sure we have a valid chromosome before we search
+    if (a.chrom1 != ".") {
+        // Find the quality hits between ***end1*** of the BEDPE and the B BED file
+        _bedB->FindOverlapsPerBin(a.chrom1, a.start1, a.end1, a.strand1, hits1, _sameStrand, _diffStrand);
+
+        vector<BED>::const_iterator h = hits1.begin();
+        vector<BED>::const_iterator hitsEnd = hits1.end();
+        for (; h != hitsEnd; ++h) {
+
+            int s = max(a.start1, h->start);
+            int e = min(a.end1, h->end);
+            int overlapBases = (e - s);             // the number of overlapping bases b/w a and b
+            int aLength = (a.end1 - a.start1);      // the length of a in b.p.
+
+            // is there enough overlap relative to the user's request? (default ~ 1bp)
+            if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) {
+                numOverlapsEnd1++;
+
+                if (type == "either") {
+                    _bedA->reportBedPETab(a);
+                    _bedB->reportBedNewLine(*h);
+                }
+                else {
+                    qualityHits1.push_back(*h);
+                }
+            }
+        }
+    }
+
+
+    // make sure we have a valid chromosome before we search
+    if (a.chrom2 != ".") {
+        // Now find the quality hits between ***end2*** of the BEDPE and the B BED file
+        _bedB->FindOverlapsPerBin(a.chrom2, a.start2, a.end2, a.strand2, hits2, _sameStrand, _diffStrand);
+
+        vector<BED>::const_iterator h = hits2.begin();
+        vector<BED>::const_iterator hitsEnd = hits2.end();
+        for (; h != hitsEnd; ++h) {
+
+            int s = max(a.start2, h->start);
+            int e = min(a.end2, h->end);
+            int overlapBases = (e - s);             // the number of overlapping bases b/w a and b
+            int aLength = (a.end2 - a.start2);      // the length of a in b.p.
+
+            // is there enough overlap relative to the user's request? (default ~ 1bp)
+            if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) {
+                numOverlapsEnd2++;
+
+                if (type == "either") {
+                    _bedA->reportBedPETab(a);
+                    _bedB->reportBedNewLine(*h);
+                }
+                else {
+                    qualityHits2.push_back(*h);
+                }
+            }
+        }
+    }
+
+    // Now report the hits depending on what the user has requested.
+    if (type == "neither") {
+        if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 == 0) ) {
+            _bedA->reportBedPENewLine(a);
+        }
+    }
+    else if (type == "notboth") {
+        if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 == 0) ) {
+            _bedA->reportBedPENewLine(a);
+        }
+        else if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 == 0) ) {
+            for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) {
+                _bedA->reportBedPETab(a);
+                _bedB->reportBedNewLine(*q);
+            }
+        }
+        else if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 > 0) ) {
+            for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) {
+                _bedA->reportBedPETab(a);
+                _bedB->reportBedNewLine(*q);
+            }
+        }
+    }
+    else if (type == "xor") {
+        if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 == 0) ) {
+            for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) {
+                _bedA->reportBedPETab(a);
+                _bedB->reportBedNewLine(*q);
+            }
+        }
+        else if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 > 0) ) {
+            for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) {
+                _bedA->reportBedPETab(a);
+                _bedB->reportBedNewLine(*q);
+            }
+        }
+    }
+    else if (type == "both") {
+        if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 > 0) ) {
+            for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) {
+                _bedA->reportBedPETab(a);
+                _bedB->reportBedNewLine(*q);
+            }
+            for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) {
+                _bedA->reportBedPETab(a);
+                _bedB->reportBedNewLine(*q);
+            }
+        }
+    }
+}
+
+
+bool BedIntersectPE::FindOneOrMoreOverlaps(const BEDPE &a, const string &type) {
+
+    // flags for the existence of hits on each end of BEDPE
+    // that exceed the requested overlap fraction
+    bool end1Found = false;
+    bool end2Found = false;
+
+    // Look for overlaps in end 1 assuming we have an aligned chromosome.
+    if (a.chrom1 != ".") {
+        end1Found = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom1, a.start1, a.end1, a.strand1,
+            _sameStrand, _diffStrand, _overlapFraction);
+
+        // can we bail out without checking end2?
+        if ((type == "either") && (end1Found == true)) return true;
+        else if ((type == "neither") && (end1Found == true)) return false;
+        else if ((type == "notboth") && (end1Found == false)) return true;
+        else if ((type == "both") && (end1Found == false)) return false;
+    }
+
+    // Now look for overlaps in end 2 assuming we have an aligned chromosome.
+    if (a.chrom2 != ".") {
+        end2Found = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom2, a.start2, a.end2, a.strand2,
+            _sameStrand, _diffStrand, _overlapFraction);
+
+        if ((type == "either") && (end2Found == true)) return true;
+        else if ((type == "neither") && (end2Found == true)) return false;
+        else if ((type == "notboth") && (end2Found == false)) return true;
+        else if ((type == "both") && (end2Found == false)) return false;
+    }
+
+    // Now report the hits depending on what the user has requested.
+    if (type == "notboth") {
+        if ( (end1Found == false) || (end2Found == false) ) return true;
+        else return false;
+    }
+    else if (type == "either") {
+        if ( (end1Found == false) && (end2Found == false) ) return false;
+    }
+    else if (type == "neither") {
+        if ( (end1Found == false) && (end2Found == false) ) return true;
+        else return false;
+    }
+    else if (type == "xor") {
+        if ( (end1Found == true) && (end2Found == false) ) return true;
+        else if ( (end1Found == false) && (end2Found == true) ) return true;
+        else return false;
+    }
+    else if (type == "both") {
+        if ( (end1Found == true) && (end2Found == true) ) return true;
+        return false;
+    }
+    return false;
+}
+
+
+void BedIntersectPE::FindSpanningOverlaps(const BEDPE &a, vector<BED> &hits, const string &type) {
+
+    // count of hits on _between_ end of BEDPE
+    // that exceed the requested overlap fraction
+    int numOverlaps = 0;
+    CHRPOS spanStart = 0;
+    CHRPOS spanEnd = 0;
+    CHRPOS spanLength = 0;
+
+    if ((type == "ispan") || (type == "notispan")) {
+        spanStart = a.end1;
+        spanEnd = a.start2;
+        if (a.end1 > a.start2) {
+            spanStart = a.end2;
+            spanEnd = a.start1;
+        }
+    }
+    else if ((type == "ospan") || (type == "notospan")) {
+        spanStart = a.start1;
+        spanEnd = a.end2;
+        if (a.start1 > a.start2) {
+            spanStart = a.start2;
+            spanEnd = a.end1;
+        }
+    }
+    spanLength = spanEnd - spanStart;
+
+    // get the hits for the span
+    _bedB->FindOverlapsPerBin(a.chrom1, spanStart, spanEnd, a.strand1, hits, _sameStrand, _diffStrand);
+
+    vector<BED>::const_iterator h = hits.begin();
+    vector<BED>::const_iterator hitsEnd = hits.end();
+    for (; h != hitsEnd; ++h) {
+
+        int s = max(spanStart, h->start);
+        int e = min(spanEnd, h->end);
+        int overlapBases = (e - s);                     // the number of overlapping bases b/w a and b
+        int spanLength = (spanEnd - spanStart);     // the length of a in b.p.
+
+        // is there enough overlap relative to the user's request? (default ~ 1bp)
+        if ( ( (float) overlapBases / (float) spanLength ) >= _overlapFraction ) {
+            numOverlaps++;
+            if ((type == "ispan") || (type == "ospan")) {
+                _bedA->reportBedPETab(a);
+                _bedB->reportBedNewLine(*h);
+            }
+        }
+    }
+
+    if ( ( (type == "notispan") || (type == "notospan") ) && numOverlaps == 0 ) {
+        _bedA->reportBedPENewLine(a);
+    }
+}
+
+
+bool BedIntersectPE::FindOneOrMoreSpanningOverlaps(const BEDPE &a, const string &type) {
+
+    int spanStart = 0;
+    int spanEnd = 0;
+    int spanLength = 0;
+    bool overlapFound;
+
+    if ((type == "ispan") || (type == "notispan")) {
+        spanStart = a.end1;
+        spanEnd = a.start2;
+        if (a.end1 > a.start2) {
+            spanStart = a.end2;
+            spanEnd = a.start1;
+        }
+    }
+    else if ((type == "ospan") || (type == "notospan")) {
+        spanStart = a.start1;
+        spanEnd = a.end2;
+        if (a.start1 > a.start2) {
+            spanStart = a.start2;
+            spanEnd = a.end1;
+        }
+    }
+    spanLength = spanEnd - spanStart;
+
+    overlapFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom1, spanStart, spanEnd, a.strand1,
+        _sameStrand, _diffStrand, _overlapFraction);
+
+    return overlapFound;
+}
+
+
+void BedIntersectPE::IntersectBedPE() {
+
+    // load the "B" bed file into a map so
+    // that we can easily compare "A" to it for overlaps
+    _bedB->loadBedFileIntoMap();
+
+    int lineNum = 0;                    // current input line number
+    vector<BED> hits, hits1, hits2;     // vector of potential hits
+
+    // reserve some space
+    hits.reserve(100);
+    hits1.reserve(100);
+    hits2.reserve(100);
+
+    BEDPE a, nullBedPE;
+    BedLineStatus bedStatus;
+
+    _bedA->Open();
+    while ((bedStatus = _bedA->GetNextBedPE(a, lineNum)) != BED_INVALID) {
+        if (bedStatus == BED_VALID) {
+            if ( (_searchType == "ispan") || (_searchType == "ospan") ||
+                 (_searchType == "notispan") || (_searchType == "notospan") ) {
+                if (a.chrom1 == a.chrom2) {
+                    FindSpanningOverlaps(a, hits, _searchType);
+                    hits.clear();
+                }
+            }
+            else {
+                FindOverlaps(a, hits1, hits2, _searchType);
+                hits1.clear();
+                hits2.clear();
+            }
+            a = nullBedPE;
+        }
+    }
+    _bedA->Close();
+}
+
+
+void BedIntersectPE::IntersectBamPE(string bamFile) {
+
+    // load the "B" bed file into a map so
+    // that we can easily compare "A" to it for overlaps
+    _bedB->loadBedFileIntoMap();
+
+    // open the BAM file
+    BamReader reader;
+    BamWriter writer;
+    reader.Open(bamFile);
+
+    // get header & reference information
+    string bamHeader = reader.GetHeaderText();
+    RefVector refs   = reader.GetReferenceData();
+
+    // open a BAM output to stdout if we are writing BAM
+    if (_bamOutput == true) {
+        // set compression mode
+        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
+        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
+        writer.SetCompressionMode(compressionMode);
+        // open our BAM writer
+        writer.Open("stdout", bamHeader, refs);
+    }
+
+    // track the previous and current sequence
+    // names so that we can identify blocks of
+    // alignments for a given read ID.
+    string prevName, currName;
+    prevName = currName = "";
+
+    vector<BamAlignment> alignments;        // vector of BAM alignments for a given ID in a BAM file.
+    alignments.reserve(100);
+
+    _bedA->bedType = 10;                    // it's a full BEDPE given it's BAM
+
+    // rip through the BAM file and convert each mapped entry to BEDPE
+    BamAlignment bam1, bam2;
+    while (reader.GetNextAlignment(bam1)) {
+        // the alignment must be paired
+        if (bam1.IsPaired() == true) {
+            // grab the second alignment for the pair.
+            reader.GetNextAlignment(bam2);
+
+            // require that the alignments are from the same query
+            if (bam1.Name == bam2.Name) {
+                ProcessBamBlock(bam1, bam2, refs, writer);
+            }
+            else {
+                cerr << "*****ERROR: BAM must be sorted by query name. " << endl;
+                exit(1);
+            }
+        }
+    }
+    // close up
+    reader.Close();
+    if (_bamOutput == true) {
+        writer.Close();
+    }
+}
+
+
+void BedIntersectPE::ProcessBamBlock (const BamAlignment &bam1, const BamAlignment &bam2,
+                                      const RefVector &refs, BamWriter &writer) {
+
+    vector<BED> hits, hits1, hits2;         // vector of potential hits
+    hits.reserve(1000);                     // reserve some space
+    hits1.reserve(1000);
+    hits2.reserve(1000);
+
+    bool overlapsFound;                     // flag to indicate if overlaps were found
+
+    if ( (_searchType == "either") || (_searchType == "xor") ||
+              (_searchType == "both") || (_searchType == "notboth") ||
+              (_searchType == "neither") ) {
+
+        // create a new BEDPE feature from the BAM alignments.
+        BEDPE a;
+        ConvertBamToBedPE(bam1, bam2, refs, a);
+        if (_bamOutput == true) {   // BAM output
+            // write to BAM if correct hits found
+            overlapsFound = FindOneOrMoreOverlaps(a, _searchType);
+            if (overlapsFound == true) {
+                writer.SaveAlignment(bam1);
+                writer.SaveAlignment(bam2);
+            }
+        }
+        else {  // BEDPE output
+            FindOverlaps(a, hits1, hits2, _searchType);
+            hits1.clear();
+            hits2.clear();
+        }
+    }
+    else if ( (_searchType == "ispan") || (_searchType == "ospan") ) {
+        // only look for ispan and ospan when both ends are mapped.
+        if (bam1.IsMapped() && bam2.IsMapped()) {
+            // only do an inspan or outspan check if the alignment is intrachromosomal
+            if (bam1.RefID == bam2.RefID) {
+                // create a new BEDPE feature from the BAM alignments.
+                BEDPE a;
+                ConvertBamToBedPE(bam1, bam2, refs, a);
+                if (_bamOutput == true) {   // BAM output
+                    // look for overlaps, and write to BAM if >=1 were found
+                    overlapsFound = FindOneOrMoreSpanningOverlaps(a, _searchType);
+                    if (overlapsFound == true) {
+                        writer.SaveAlignment(bam1);
+                        writer.SaveAlignment(bam2);
+                    }
+                }
+                else {  // BEDPE output
+                    FindSpanningOverlaps(a, hits, _searchType);
+                    hits.clear();
+                }
+            }
+        }
+    }
+    else if ( (_searchType == "notispan") || (_searchType == "notospan") ) {
+        // only look for notispan and notospan when both ends are mapped.
+        if (bam1.IsMapped() && bam2.IsMapped()) {
+            // only do an inspan or outspan check if the alignment is intrachromosomal
+            if (bam1.RefID == bam2.RefID) {
+                // create a new BEDPE feature from the BAM alignments.
+                BEDPE a;
+                ConvertBamToBedPE(bam1, bam2, refs, a);
+                if (_bamOutput == true) {   // BAM output
+                    // write to BAM if there were no overlaps
+                    overlapsFound = FindOneOrMoreSpanningOverlaps(a, _searchType);
+                    if (overlapsFound == false) {
+                        writer.SaveAlignment(bam1);
+                        writer.SaveAlignment(bam2);
+                    }
+                }
+                else {  // BEDPE output
+                    FindSpanningOverlaps(a, hits, _searchType);
+                    hits.clear();
+                }
+            }
+            // if inter-chromosomal or orphaned, we know it's not ispan and not ospan
+            else if (_bamOutput == true) {
+                writer.SaveAlignment(bam1);
+                writer.SaveAlignment(bam2);
+            }
+        }
+        // if both ends aren't mapped, we know that it's notispan and not ospan
+        else if (_bamOutput == true) {
+            writer.SaveAlignment(bam1);
+            writer.SaveAlignment(bam2);
+        }
+    }
+}
+
+