Mercurial > repos > aaronquinlan > multi_intersect
diff BEDTools-Version-2.14.3/src/utils/bedFile/bedFile.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BEDTools-Version-2.14.3/src/utils/bedFile/bedFile.cpp Thu Nov 03 10:25:04 2011 -0400 @@ -0,0 +1,720 @@ +/***************************************************************************** + bedFile.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licensed under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "bedFile.h" + + +/************************************************ +Helper functions +*************************************************/ +void splitBedIntoBlocks(const BED &bed, int lineNum, bedVector &bedBlocks) { + + if (bed.otherFields.size() < 6) { + cerr << "Input error: Cannot split into blocks. Found interval with fewer than 12 columns on line " << lineNum << "." << endl; + exit(1); + } + + int blockCount = atoi(bed.otherFields[3].c_str()); + if ( blockCount <= 0 ) { + cerr << "Input error: found interval having <= 0 blocks on line " << lineNum << "." << endl; + exit(1); + } + else if ( blockCount == 1 ) { + //take a short-cut for single blocks + bedBlocks.push_back(bed); + } + else { + // get the comma-delimited strings for the BED12 block starts and block ends. + string blockSizes(bed.otherFields[4]); + string blockStarts(bed.otherFields[5]); + + vector<int> sizes; + vector<int> starts; + Tokenize(blockSizes, sizes, ","); + Tokenize(blockStarts, starts, ","); + + if ( sizes.size() != (size_t) blockCount || starts.size() != (size_t) blockCount ) { + cerr << "Input error: found interval with block-counts not matching starts/sizes on line " << lineNum << "." << endl; + exit(1); + } + + // add each BED block to the bedBlocks vector + for (UINT i = 0; i < (UINT) blockCount; ++i) { + CHRPOS blockStart = bed.start + starts[i]; + CHRPOS blockEnd = bed.start + starts[i] + sizes[i]; + BED currBedBlock(bed.chrom, blockStart, blockEnd, bed.name, bed.score, bed.strand, bed.otherFields); + bedBlocks.push_back(currBedBlock); + } + } +} + + +/*********************************************** +Sorting comparison functions +************************************************/ +bool sortByChrom(BED const &a, BED const &b) { + if (a.chrom < b.chrom) return true; + else return false; +}; + +bool sortByStart(const BED &a, const BED &b) { + if (a.start < b.start) return true; + else return false; +}; + +bool sortBySizeAsc(const BED &a, const BED &b) { + + CHRPOS aLen = a.end - a.start; + CHRPOS bLen = b.end - b.start; + + if (aLen < bLen) return true; + else return false; +}; + +bool sortBySizeDesc(const BED &a, const BED &b) { + + CHRPOS aLen = a.end - a.start; + CHRPOS bLen = b.end - b.start; + + if (aLen > bLen) return true; + else return false; +}; + +bool sortByScoreAsc(const BED &a, const BED &b) { + if (a.score < b.score) return true; + else return false; +}; + +bool sortByScoreDesc(const BED &a, const BED &b) { + if (a.score > b.score) return true; + else return false; +}; + +bool byChromThenStart(BED const &a, BED const &b) { + + if (a.chrom < b.chrom) return true; + else if (a.chrom > b.chrom) return false; + + if (a.start < b.start) return true; + else if (a.start >= b.start) return false; + + return false; +}; + + +/******************************************* +Class methods +*******************************************/ + +// Constructor +BedFile::BedFile(string &bedFile) +: bedFile(bedFile), + _isGff(false), + _isVcf(false), + _typeIsKnown(false), + _merged_start(-1), + _merged_end(-1), + _merged_chrom(""), + _prev_start(-1), + _prev_chrom("") +{} + +// Destructor +BedFile::~BedFile(void) { +} + + +void BedFile::Open(void) { + + _bedFields.reserve(12); + + if (bedFile == "stdin" || bedFile == "-") { + _bedStream = &cin; + } + else { + _bedStream = new ifstream(bedFile.c_str(), ios::in); + + if( isGzipFile(_bedStream) ) { + delete _bedStream; + _bedStream = new igzstream(bedFile.c_str(), ios::in); + } + if ( !(_bedStream->good()) ) { + cerr << "Error: The requested bed file (" << bedFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + } +} + +// Rewind the pointer back to the beginning of the file +void BedFile::Rewind(void) { + _bedStream->seekg(0, ios::beg); +} + +// Jump to a specific byte in the file +void BedFile::Seek(unsigned long offset) { + _bedStream->seekg(offset); +} + +// Jump to a specific byte in the file +bool BedFile::Empty() { + return _bedStream->eof(); +} + + +// Close the BED file +void BedFile::Close(void) { + if (bedFile != "stdin" && bedFile != "-") delete _bedStream; +} + + +BedLineStatus BedFile::GetNextBed(BED &bed, int &lineNum, bool forceSorted) { + + // make sure there are still lines to process. + // if so, tokenize, validate and return the BED entry. + _bedFields.clear(); + // clear out the previous bed's data + if (_bedStream->good()) { + // parse the bedStream pointer + getline(*_bedStream, _bedLine); + lineNum++; + + // split into a string vector. + Tokenize(_bedLine, _bedFields); + + // load the BED struct as long as it's a valid BED entry. + BedLineStatus status = parseLine(bed, _bedFields, lineNum); + if (!forceSorted) { + return status; + } + else if (status == BED_VALID) { + if (bed.chrom == _prev_chrom) { + if ((int) bed.start >= _prev_start) { + _prev_chrom = bed.chrom; + _prev_start = bed.start; + return status; + } + else { + cerr << "ERROR: input file: (" << bedFile << ") is not sorted by chrom then start" << endl; + exit(1); + } + } + else if (bed.chrom > _prev_chrom) { + _prev_chrom = bed.chrom; + _prev_start = bed.start; + return status; + } + else if (bed.chrom < _prev_chrom) { + cerr << "ERROR: input file: (" << bedFile << ") is not sorted by chrom then start" << endl; + exit(1); + } + } + } + + // default if file is closed or EOF + return BED_INVALID; +} + + +bool BedFile::GetNextMergedBed(BED &merged_bed, int &lineNum) { + + if (_bedStream->good()) { + BED bed; + BedLineStatus bedStatus; + // force sorting; hence third param = true + while ((bedStatus = GetNextBed(bed, lineNum, true)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + if (((int) bed.start - _merged_end > 0) || + (_merged_end < 0) || + (bed.chrom != _merged_chrom)) + { + if (_merged_start >= 0) { + merged_bed.chrom = _merged_chrom; + merged_bed.start = _merged_start; + merged_bed.end = _merged_end; + + _merged_chrom = bed.chrom; + _merged_start = bed.start; + _merged_end = bed.end; + + return true; + } + else { + _merged_start = bed.start; + _merged_chrom = bed.chrom; + _merged_end = bed.end; + } + } + else if ((int) bed.end > _merged_end) + { + _merged_end = bed.end; + } + } + } + // handle the last merged block in the file. + if (bedStatus == BED_INVALID) + { + merged_bed.chrom = _merged_chrom; + merged_bed.start = _merged_start; + merged_bed.end = _merged_end; + return true; + } + } + return false; +} + + +void BedFile::FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, + string strand, vector<BED> &hits, bool sameStrand, bool diffStrand) { + + BIN startBin, endBin; + startBin = (start >> _binFirstShift); + endBin = ((end-1) >> _binFirstShift); + + // loop through each bin "level" in the binning hierarchy + for (BINLEVEL i = 0; i < _binLevels; ++i) { + + // loop through each bin at this level of the hierarchy + BIN offset = _binOffsetsExtended[i]; + for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { + + // loop through each feature in this chrom/bin and see if it overlaps + // with the feature that was passed in. if so, add the feature to + // the list of hits. + vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin(); + vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end(); + + for (; bedItr != bedEnd; ++bedItr) { + // do we have sufficient overlap? + if (overlaps(bedItr->start, bedItr->end, start, end) > 0) { + + bool strands_are_same = (strand == bedItr->strand); + + // test for necessary strandedness + if ( (sameStrand == false && diffStrand == false) + || + (sameStrand == true && strands_are_same == true) + || + (diffStrand == true && strands_are_same == false) + ) + { + hits.push_back(*bedItr); + } + } + } + } + startBin >>= _binNextShift; + endBin >>= _binNextShift; + } +} + + +bool BedFile::FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, + bool sameStrand, bool diffStrand, float overlapFraction) { + + BIN startBin, endBin; + startBin = (start >> _binFirstShift); + endBin = ((end-1) >> _binFirstShift); + + CHRPOS aLength = (end - start); + + // loop through each bin "level" in the binning hierarchy + for (BINLEVEL i = 0; i < _binLevels; ++i) { + + // loop through each bin at this level of the hierarchy + BIN offset = _binOffsetsExtended[i]; + for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { + + // loop through each feature in this chrom/bin and see if it overlaps + // with the feature that was passed in. if so, add the feature to + // the list of hits. + vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin(); + vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end(); + for (; bedItr != bedEnd; ++bedItr) { + + CHRPOS s = max(start, bedItr->start); + CHRPOS e = min(end, bedItr->end); + // the number of overlapping bases b/w a and b + int overlapBases = (e - s); + + // do we have sufficient overlap? + if ( (float) overlapBases / (float) aLength >= overlapFraction) { + + bool strands_are_same = (strand == bedItr->strand); + // test for necessary strandedness + if ( (sameStrand == false && diffStrand == false) + || + (sameStrand == true && strands_are_same == true) + || + (diffStrand == true && strands_are_same == false) + ) + { + return true; + } + } + } + } + startBin >>= _binNextShift; + endBin >>= _binNextShift; + } + return false; +} + + +bool BedFile::FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, + bool sameStrand, bool diffStrand, float overlapFraction) { + + BIN startBin, endBin; + startBin = (start >> _binFirstShift); + endBin = ((end-1) >> _binFirstShift); + + CHRPOS aLength = (end - start); + + // loop through each bin "level" in the binning hierarchy + for (BINLEVEL i = 0; i < _binLevels; ++i) { + + // loop through each bin at this level of the hierarchy + BIN offset = _binOffsetsExtended[i]; + for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { + + // loop through each feature in this chrom/bin and see if it overlaps + // with the feature that was passed in. if so, add the feature to + // the list of hits. + vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin(); + vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end(); + for (; bedItr != bedEnd; ++bedItr) { + CHRPOS s = max(start, bedItr->start); + CHRPOS e = min(end, bedItr->end); + + // the number of overlapping bases b/w a and b + int overlapBases = (e - s); + + // do we have sufficient overlap? + if ( (float) overlapBases / (float) aLength >= overlapFraction) { + CHRPOS bLength = (bedItr->end - bedItr->start); + float bOverlap = ( (float) overlapBases / (float) bLength ); + bool strands_are_same = (strand == bedItr->strand); + + // test for sufficient reciprocal overlap and strandedness + if ( (bOverlap >= overlapFraction) && + ((sameStrand == false && diffStrand == false) + || + (sameStrand == true && strands_are_same == true) + || + (diffStrand == true && strands_are_same == false)) + ) + { + return true; + } + } + } + } + startBin >>= _binNextShift; + endBin >>= _binNextShift; + } + return false; +} + + +void BedFile::countHits(const BED &a, bool sameStrand, bool diffStrand, bool countsOnly) { + + BIN startBin, endBin; + startBin = (a.start >> _binFirstShift); + endBin = ((a.end-1) >> _binFirstShift); + + // loop through each bin "level" in the binning hierarchy + for (BINLEVEL i = 0; i < _binLevels; ++i) { + + // loop through each bin at this level of the hierarchy + BIN offset = _binOffsetsExtended[i]; + for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { + + // loop through each feature in this chrom/bin and see if it overlaps + // with the feature that was passed in. if so, add the feature to + // the list of hits. + vector<BEDCOV>::iterator bedItr = bedCovMap[a.chrom][j].begin(); + vector<BEDCOV>::iterator bedEnd = bedCovMap[a.chrom][j].end(); + for (; bedItr != bedEnd; ++bedItr) { + + bool strands_are_same = (a.strand == bedItr->strand); + // skip the hit if not on the same strand (and we care) + if ((sameStrand == true && strands_are_same == false) || + (diffStrand == true && strands_are_same == true) + ) + { + continue; + } + else if (overlaps(bedItr->start, bedItr->end, a.start, a.end) > 0) { + + bedItr->count++; + if (countsOnly == false) { + if (a.zeroLength == false) { + bedItr->depthMap[a.start+1].starts++; + bedItr->depthMap[a.end].ends++; + } + else { + // correct for the fact that we artificially expanded the zeroLength feature + bedItr->depthMap[a.start+2].starts++; + bedItr->depthMap[a.end-1].ends++; + } + + if (a.start < bedItr->minOverlapStart) { + bedItr->minOverlapStart = a.start; + } + } + } + } + } + startBin >>= _binNextShift; + endBin >>= _binNextShift; + } +} + + +void BedFile::countSplitHits(const vector<BED> &bedBlocks, bool sameStrand, bool diffStrand, bool countsOnly) { + + // set to track the distinct B features that had coverage. + // we'll update the counts of coverage for these features by one + // at the end of this function to avoid over-counting. + set< vector<BEDCOV>::iterator > validHits; + + vector<BED>::const_iterator blockItr = bedBlocks.begin(); + vector<BED>::const_iterator blockEnd = bedBlocks.end(); + for (; blockItr != blockEnd; ++blockItr) { + + BIN startBin, endBin; + startBin = (blockItr->start >> _binFirstShift); + endBin = ((blockItr->end-1) >> _binFirstShift); + + // loop through each bin "level" in the binning hierarchy + for (BINLEVEL i = 0; i < _binLevels; ++i) { + + // loop through each bin at this level of the hierarchy + BIN offset = _binOffsetsExtended[i]; + for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { + + // loop through each feature in this chrom/bin and see if it overlaps + // with the feature that was passed in. if so, add the feature to + // the list of hits. + vector<BEDCOV>::iterator bedItr = bedCovMap[blockItr->chrom][j].begin(); + vector<BEDCOV>::iterator bedEnd = bedCovMap[blockItr->chrom][j].end(); + for (; bedItr != bedEnd; ++bedItr) { + + bool strands_are_same = (blockItr->strand == bedItr->strand); + // skip the hit if not on the same strand (and we care) + if ((sameStrand == true && strands_are_same == false) || + (diffStrand == true && strands_are_same == true) + ) + { + continue; + } + else if (overlaps(bedItr->start, bedItr->end, blockItr->start, blockItr->end) > 0) { + if (countsOnly == false) { + if (blockItr->zeroLength == false) { + bedItr->depthMap[blockItr->start+1].starts++; + bedItr->depthMap[blockItr->end].ends++; + } + else { + // correct for the fact that we artificially expanded the zeroLength feature + bedItr->depthMap[blockItr->start+2].starts++; + bedItr->depthMap[blockItr->end-1].ends++; + } + } + + validHits.insert(bedItr); + if (blockItr->start < bedItr->minOverlapStart) + bedItr->minOverlapStart = blockItr->start; + } + } + } + startBin >>= _binNextShift; + endBin >>= _binNextShift; + } + } + // incrment the count of overlap by one for each B feature that overlapped + // the current passed hit. This is necessary to prevent over-counting for + // each "split"" of a single read. + set< vector<BEDCOV>::iterator >::iterator validHitsItr = validHits.begin(); + set< vector<BEDCOV>::iterator >::iterator validHitsEnd = validHits.end(); + for (; validHitsItr != validHitsEnd; ++validHitsItr) + // the validHitsItr points to another itr, hence the (*itr)-> dereferencing. + // ugly, but that's C++. + (*validHitsItr)->count++; +} + + +void BedFile::countListHits(const BED &a, int index, bool sameStrand, bool diffStrand) { + + BIN startBin, endBin; + startBin = (a.start >> _binFirstShift); + endBin = ((a.end-1) >> _binFirstShift); + + // loop through each bin "level" in the binning hierarchy + for (BINLEVEL i = 0; i < _binLevels; ++i) { + + // loop through each bin at this level of the hierarchy + BIN offset = _binOffsetsExtended[i]; + for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { + + // loop through each feature in this chrom/bin and see if it overlaps + // with the feature that was passed in. if so, add the feature to + // the list of hits. + vector<BEDCOVLIST>::iterator bedItr = bedCovListMap[a.chrom][j].begin(); + vector<BEDCOVLIST>::iterator bedEnd = bedCovListMap[a.chrom][j].end(); + for (; bedItr != bedEnd; ++bedItr) { + + bool strands_are_same = (a.strand == bedItr->strand); + // skip the hit if not on the same strand (and we care) + if ((sameStrand == true && strands_are_same == false) || + (diffStrand == true && strands_are_same == true) + ) + { + continue; + } + else if (overlaps(bedItr->start, bedItr->end, a.start, a.end) > 0) { + bedItr->counts[index]++; + if (a.zeroLength == false) { + bedItr->depthMapList[index][a.start+1].starts++; + bedItr->depthMapList[index][a.end].ends++; + } + else { + // correct for the fact that we artificially expanded the zeroLength feature + bedItr->depthMapList[index][a.start+2].starts++; + bedItr->depthMapList[index][a.end-1].ends++; + } + + if (a.start < bedItr->minOverlapStarts[index]) { + bedItr->minOverlapStarts[index] = a.start; + } + } + } + } + startBin >>= _binNextShift; + endBin >>= _binNextShift; + } +} + +void BedFile::setZeroBased(bool zeroBased) { this->isZeroBased = zeroBased; } + +void BedFile::setGff (bool gff) { this->_isGff = gff; } + + +void BedFile::setVcf (bool vcf) { this->_isVcf = vcf; } + + +void BedFile::setFileType (FileType type) { + _fileType = type; + _typeIsKnown = true; +} + + +void BedFile::setBedType (int colNums) { + bedType = colNums; +} + + +void BedFile::loadBedFileIntoMap() { + + BED bedEntry, nullBed; + int lineNum = 0; + BedLineStatus bedStatus; + + Open(); + while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + BIN bin = getBin(bedEntry.start, bedEntry.end); + bedMap[bedEntry.chrom][bin].push_back(bedEntry); + bedEntry = nullBed; + } + } + Close(); +} + + +void BedFile::loadBedCovFileIntoMap() { + + BED bedEntry, nullBed; + int lineNum = 0; + BedLineStatus bedStatus; + + Open(); + while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + BIN bin = getBin(bedEntry.start, bedEntry.end); + + BEDCOV bedCov; + bedCov.chrom = bedEntry.chrom; + bedCov.start = bedEntry.start; + bedCov.end = bedEntry.end; + bedCov.name = bedEntry.name; + bedCov.score = bedEntry.score; + bedCov.strand = bedEntry.strand; + bedCov.otherFields = bedEntry.otherFields; + bedCov.zeroLength = bedEntry.zeroLength; + bedCov.count = 0; + bedCov.minOverlapStart = INT_MAX; + + bedCovMap[bedEntry.chrom][bin].push_back(bedCov); + bedEntry = nullBed; + } + } + Close(); +} + +void BedFile::loadBedCovListFileIntoMap() { + + BED bedEntry, nullBed; + int lineNum = 0; + BedLineStatus bedStatus; + + Open(); + while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + BIN bin = getBin(bedEntry.start, bedEntry.end); + + BEDCOVLIST bedCovList; + bedCovList.chrom = bedEntry.chrom; + bedCovList.start = bedEntry.start; + bedCovList.end = bedEntry.end; + bedCovList.name = bedEntry.name; + bedCovList.score = bedEntry.score; + bedCovList.strand = bedEntry.strand; + bedCovList.otherFields = bedEntry.otherFields; + bedCovList.zeroLength = bedEntry.zeroLength; + + bedCovListMap[bedEntry.chrom][bin].push_back(bedCovList); + bedEntry = nullBed; + } + } + Close(); +} + + +void BedFile::loadBedFileIntoMapNoBin() { + + BED bedEntry, nullBed; + int lineNum = 0; + BedLineStatus bedStatus; + + Open(); + while ((bedStatus = this->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + bedMapNoBin[bedEntry.chrom].push_back(bedEntry); + bedEntry = nullBed; + } + } + Close(); + + // sort the BED entries for each chromosome + // in ascending order of start position + for (masterBedMapNoBin::iterator m = this->bedMapNoBin.begin(); m != this->bedMapNoBin.end(); ++m) { + sort(m->second.begin(), m->second.end(), sortByStart); + } +}