diff BEDTools-Version-2.14.3/src/shuffleBed/shuffleBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BEDTools-Version-2.14.3/src/shuffleBed/shuffleBed.cpp	Thu Nov 03 10:25:04 2011 -0400
@@ -0,0 +1,244 @@
+/*****************************************************************************
+  shuffleBed.cpp
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#include "lineFileUtilities.h"
+#include "shuffleBed.h"
+
+
+BedShuffle::BedShuffle(string &bedFile, string &genomeFile, string &excludeFile, string &includeFile, 
+                       bool haveSeed, bool haveExclude, bool haveInclude, bool sameChrom, 
+                       float overlapFraction, int seed) {
+
+    _bedFile         = bedFile;
+    _genomeFile      = genomeFile;
+    _excludeFile     = excludeFile;
+    _includeFile     = includeFile;
+    _sameChrom       = sameChrom;
+    _haveExclude     = haveExclude;
+    _haveInclude     = haveInclude;
+    _overlapFraction = overlapFraction;
+    _haveSeed        = haveSeed;
+
+
+    // use the supplied seed for the random
+    // number generation if given.  else,
+    // roll our own.
+    if (_haveSeed) {
+        _seed = seed;
+        srand(seed);
+    }
+    else {
+        // thanks to Rob Long for the tip.
+        _seed = (unsigned)time(0)+(unsigned)getpid();
+        srand(_seed);
+    }
+
+    _bed         = new BedFile(bedFile);
+    _genome      = new GenomeFile(genomeFile);
+    _chroms      = _genome->getChromList();
+    _numChroms   = _genome->getNumberOfChroms();
+
+    if (_haveExclude) {
+        _exclude = new BedFile(excludeFile);
+        _exclude->loadBedFileIntoMap();
+    }
+    
+    if (_haveInclude) {
+        _include = new BedFile(includeFile);
+        _include->loadBedFileIntoMapNoBin();
+        
+        _numIncludeChroms = 0;
+        masterBedMapNoBin::const_iterator it    = _include->bedMapNoBin.begin(); 
+        masterBedMapNoBin::const_iterator itEnd = _include->bedMapNoBin.end();
+        for(; it != itEnd; ++it) {
+            _includeChroms.push_back(it->first);
+            _numIncludeChroms++;
+        }
+    }
+
+    if (_haveExclude == true && _haveInclude == false)
+        ShuffleWithExclusions();
+    else if  (_haveExclude == false && _haveInclude == true)
+        ShuffleWithInclusions();
+    else
+        Shuffle();
+}
+
+
+BedShuffle::~BedShuffle(void) {
+
+}
+
+
+void BedShuffle::Shuffle() {
+
+    int lineNum = 0;
+    BED bedEntry, nullBed;     // used to store the current BED line from the BED file.
+    BedLineStatus bedStatus;
+
+    _bed->Open();
+    while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) {
+        if (bedStatus == BED_VALID) {
+            ChooseLocus(bedEntry);
+            _bed->reportBedNewLine(bedEntry);
+            bedEntry = nullBed;
+        }
+    }
+    _bed->Close();
+}
+
+
+
+void BedShuffle::ShuffleWithExclusions() {
+
+    int lineNum = 0;
+    BED bedEntry, nullBed;     // used to store the current BED line from the BED file.
+    BedLineStatus bedStatus;
+
+    _bed->Open();
+    while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) {
+        if (bedStatus == BED_VALID) {
+            // keep looking as long as the chosen
+            // locus happens to overlap with regions
+            // that the user wishes to exclude.
+            int  tries = 0;
+            bool haveOverlap = false;
+            do 
+            {
+                // choose a new locus
+                ChooseLocus(bedEntry);
+                haveOverlap = _exclude->FindOneOrMoreOverlapsPerBin(bedEntry.chrom, bedEntry.start, bedEntry.end,
+                                                                    bedEntry.strand, false, _overlapFraction);
+                tries++;
+            } while ((haveOverlap == true) && (tries <= MAX_TRIES));
+            
+
+            if (tries > MAX_TRIES) {
+                cerr << "Error, line " << lineNum << ": tried " << MAX_TRIES << " potential loci for entry, but could not avoid excluded regions.  Ignoring entry and moving on." << endl;
+            }
+            else {
+                _bed->reportBedNewLine(bedEntry);
+            }
+        }
+        bedEntry = nullBed;
+    }
+    _bed->Close();
+}
+
+
+void BedShuffle::ShuffleWithInclusions() {
+
+    int lineNum = 0;
+    BED bedEntry, nullBed;     // used to store the current BED line from the BED file.
+    BedLineStatus bedStatus;
+
+    _bed->Open();
+    while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) {
+        if (bedStatus == BED_VALID) {
+            // choose a new locus
+            ChooseLocusFromInclusionFile(bedEntry);
+            _bed->reportBedNewLine(bedEntry);
+        }
+        bedEntry = nullBed;
+    }
+    _bed->Close();
+}
+
+
+void BedShuffle::ChooseLocus(BED &bedEntry) {
+
+    string chrom = bedEntry.chrom;
+    CHRPOS start    = bedEntry.start;
+    CHRPOS end      = bedEntry.end;
+    CHRPOS length   = end - start;
+
+    string randomChrom;
+    CHRPOS randomStart;
+    CHRPOS chromSize;
+
+    if (_sameChrom == false) {
+        randomChrom    = _chroms[rand() % _numChroms];
+        chromSize      = _genome->getChromSize(randomChrom);
+        randomStart    = rand() % chromSize;
+        bedEntry.chrom = randomChrom;
+        bedEntry.start = randomStart;
+        bedEntry.end   = randomStart + length;
+    }
+    else {
+        chromSize      = _genome->getChromSize(chrom);
+        randomStart    = rand() % chromSize;
+        bedEntry.start = randomStart;
+        bedEntry.end   = randomStart + length;
+    }
+
+    // ensure that the chosen location doesn't go past
+    // the length of the chromosome. if so, keep looking
+    // for a new spot.
+    while (bedEntry.end > chromSize) {
+        if (_sameChrom == false) {
+            randomChrom    = _chroms[rand() % _numChroms];
+            chromSize      = _genome->getChromSize(randomChrom);
+            randomStart    = rand() % chromSize;
+            bedEntry.chrom = randomChrom;
+            bedEntry.start = randomStart;
+            bedEntry.end   = randomStart + length;
+        }
+        else {
+            chromSize      = _genome->getChromSize(chrom);
+            randomStart    = rand() % chromSize;
+            bedEntry.start = randomStart;
+            bedEntry.end   = randomStart + length;
+        }
+    }
+}
+
+
+void BedShuffle::ChooseLocusFromInclusionFile(BED &bedEntry) {
+
+    string chrom    = bedEntry.chrom;
+    CHRPOS length   = bedEntry.end - bedEntry.start;
+
+    string randomChrom;
+    CHRPOS randomStart;
+    BED includeInterval;
+    
+    if (_sameChrom == false) {
+
+        // grab a random chromosome from the inclusion file.
+        randomChrom            = _includeChroms[rand() % _numIncludeChroms];
+        // get the number of inclusion intervals for that chrom
+        size_t size            =  _include->bedMapNoBin[randomChrom].size();
+        // grab a random interval on the chosen chromosome.
+        size_t interval        = rand() % size;
+        // retreive a ranom -incl interval on the selected chrom
+        includeInterval        = _include->bedMapNoBin[randomChrom][interval];
+
+        bedEntry.chrom = randomChrom;        
+    }
+    else {
+        // get the number of inclusion intervals for the original chrom
+        size_t size =  _include->bedMapNoBin[chrom].size();
+        // grab a random interval on the chosen chromosome.
+        includeInterval       = _include->bedMapNoBin[chrom][rand() % size];
+    }
+    
+    randomStart    = includeInterval.start + rand() % (includeInterval.size());
+    bedEntry.start = randomStart;
+    bedEntry.end   = randomStart + length;
+    
+    // use recursion to ensure that the chosen location 
+    // doesn't go past the end of the chrom
+    if (bedEntry.end > ((size_t) _genome->getChromSize(chrom))) {
+        //bedEntry.end = _genome->getChromSize(chrom);
+        ChooseLocusFromInclusionFile(bedEntry);
+    }
+}
+