Mercurial > repos > aaronquinlan > multi_intersect
diff BEDTools-Version-2.14.3/src/utils/bedFilePE/bedFilePE.cpp @ 1:bec36315bd12 default tip
Deleted selected files
author | aaronquinlan |
---|---|
date | Sat, 19 Nov 2011 14:17:03 -0500 |
parents | dfcd8b6c1bda |
children |
line wrap: on
line diff
--- a/BEDTools-Version-2.14.3/src/utils/bedFilePE/bedFilePE.cpp Thu Nov 03 10:25:04 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,530 +0,0 @@ -// -// bedFilePE.cpp -// BEDTools -// -// Created by Aaron Quinlan Spring 2009. -// Copyright 2009 Aaron Quinlan. All rights reserved. -// -// Summary: Contains common functions for finding BED overlaps. -// -// Acknowledgments: Much of the code herein is taken from Jim Kent's -// BED processing code. I am grateful for his elegant -// genome binning algorithm and therefore use it extensively. - - -#include "bedFilePE.h" - - -// Constructor -BedFilePE::BedFilePE(string &bedFile) { - this->bedFile = bedFile; -} - -// Destructor -BedFilePE::~BedFilePE(void) { -} - -void BedFilePE::Open(void) { - if (bedFile == "stdin" || bedFile == "-") { - _bedStream = &cin; - } - else { - _bedStream = new ifstream(bedFile.c_str(), ios::in); - - if (isGzipFile(_bedStream) == true) { - delete _bedStream; - _bedStream = new igzstream(bedFile.c_str(), ios::in); - } - // can we open the file? - if ( !(_bedStream->good()) ) { - cerr << "Error: The requested bed file (" << bedFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - } -} - - - -// Close the BEDPE file -void BedFilePE::Close(void) { - if (bedFile != "stdin" && bedFile != "-") delete _bedStream; -} - - -BedLineStatus BedFilePE::GetNextBedPE (BEDPE &bedpe, int &lineNum) { - - // make sure there are still lines to process. - // if so, tokenize, validate and return the BEDPE entry. - if (_bedStream->good()) { - string bedPELine; - vector<string> bedPEFields; - bedPEFields.reserve(10); - - // parse the bedStream pointer - getline(*_bedStream, bedPELine); - lineNum++; - - // split into a string vector. - Tokenize(bedPELine,bedPEFields); - - // load the BEDPE struct as long as it's a valid BEDPE entry. - return parseLine(bedpe, bedPEFields, lineNum); - } - // default if file is closed or EOF - return BED_INVALID; -} - - -/* - reportBedPETab - - Writes the _original_ BED entry for A. - Works for BEDPE only. -*/ -void BedFilePE::reportBedPETab(const BEDPE &a) { - - if (this->bedType == 6) { - printf("%s\t%d\t%d\t%s\t%d\t%d\t", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2); - } - else if (this->bedType == 7) { - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2, - a.name.c_str()); - } - else if (this->bedType == 8) { - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2, - a.name.c_str(), a.score.c_str()); - } - else if (this->bedType == 10) { - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2, - a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); - } - else if (this->bedType > 10) { - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2, - a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); - - vector<string>::const_iterator othIt = a.otherFields.begin(); - vector<string>::const_iterator othEnd = a.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("\t%s", othIt->c_str()); - } - printf("\t"); - } -} - - - -/* - reportBedPENewLine - - Writes the _original_ BED entry for A. - Works for BEDPE only. -*/ -void BedFilePE::reportBedPENewLine(const BEDPE &a) { - - if (this->bedType == 6) { - printf("%s\t%d\t%d\t%s\t%d\t%d\n", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2); - } - else if (this->bedType == 7) { - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\n", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2, - a.name.c_str()); - } - else if (this->bedType == 8) { - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\n", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2, - a.name.c_str(), a.score.c_str()); - } - else if (this->bedType == 10) { - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2, - a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); - } - else if (this->bedType > 10) { - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", a.chrom1.c_str(), a.start1, a.end1, - a.chrom2.c_str(), a.start2, a.end2, - a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); - - vector<string>::const_iterator othIt = a.otherFields.begin(); - vector<string>::const_iterator othEnd = a.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("\t%s", othIt->c_str()); - } - printf("\n"); - } -} - - -BedLineStatus BedFilePE::parseLine (BEDPE &bedpe, const vector<string> &lineVector, int &lineNum) { - - // bail out if we have a blank line - if (lineVector.empty()) - return BED_BLANK; - - if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) { - // we need at least 6 columns - if (lineVector.size() >= 6) { - if (parseBedPELine(bedpe, lineVector, lineNum) == true) - return BED_VALID; - else return BED_INVALID; - } - else { - cerr << "It looks as though you have less than 6 columns. Are you sure your files are tab-delimited?" << endl; - exit(1); - } - } - else { - lineNum--; - return BED_HEADER; - } - - // default - return BED_INVALID; -} - - -bool BedFilePE::parseBedPELine (BEDPE &bed, const vector<string> &lineVector, const int &lineNum) { - - if ((lineNum == 1) && (lineVector.size() >= 6)) { - - this->bedType = lineVector.size(); - - if (this->bedType == 6) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - return true; - } - else if (this->bedType == 7) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - bed.name = lineVector[6]; - return true; - } - else if (this->bedType == 8) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - bed.name = lineVector[6]; - bed.score = lineVector[7].c_str(); - return true; - } - else if (this->bedType == 10) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - bed.name = lineVector[6]; - bed.score = lineVector[7].c_str(); - - bed.strand1 = lineVector[8]; - bed.strand2 = lineVector[9]; - - return true; - } - else if (this->bedType > 10) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - bed.name = lineVector[6]; - bed.score = lineVector[7].c_str(); - - bed.strand1 = lineVector[8]; - bed.strand2 = lineVector[9]; - - for (unsigned int i = 10; i < lineVector.size(); ++i) { - bed.otherFields.push_back(lineVector[i]); - } - return true; - } - else { - cerr << "Unexpected number of fields: " << lineNum << ". Verify that your files are TAB-delimited and that your BEDPE file has 6,7,8 or 10 fields. Exiting..." << endl; - exit(1); - } - - if (bed.start1 > bed.end1) { - cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl; - return false; - } - else if (bed.start2 > bed.end2) { - cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl; - return false; - } - else if ( (bed.start1 < 0) || (bed.end1 < 0) || (bed.start2 < 0) || (bed.end2 < 0) ) { - cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Coordinate <= 0. Ignoring it and moving on." << endl; - return false; - } - } - else if ( (lineNum > 1) && (lineVector.size() == this->bedType)) { - - if (this->bedType == 6) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - return true; - } - else if (this->bedType == 7) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - bed.name = lineVector[6]; - return true; - } - else if (this->bedType == 8) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - bed.name = lineVector[6]; - bed.score = lineVector[7].c_str(); - return true; - } - else if (this->bedType == 10) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - bed.name = lineVector[6]; - bed.score = lineVector[7].c_str(); - - bed.strand1 = lineVector[8]; - bed.strand2 = lineVector[9]; - - return true; - } - else if (this->bedType > 10) { - bed.chrom1 = lineVector[0]; - bed.start1 = atoi(lineVector[1].c_str()); - bed.end1 = atoi(lineVector[2].c_str()); - - bed.chrom2 = lineVector[3]; - bed.start2 = atoi(lineVector[4].c_str()); - bed.end2 = atoi(lineVector[5].c_str()); - - bed.name = lineVector[6]; - bed.score = lineVector[7].c_str(); - - bed.strand1 = lineVector[8]; - bed.strand2 = lineVector[9]; - - for (unsigned int i = 10; i < lineVector.size(); ++i) { - bed.otherFields.push_back(lineVector[i]); - } - return true; - } - else { - cerr << "Unexpected number of fields: " << lineNum << ". Verify that your files are TAB-delimited and that your BEDPE file has 6,7,8 or 10 fields. Exiting..." << endl; - exit(1); - } - - if (bed.start1 > bed.end1) { - cerr << "Error: malformed BED entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl; - return false; - } - else if (bed.start2 > bed.end2) { - cerr << "Error: malformed BED entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl; - return false; - } - else if ( (bed.start1 < 0) || (bed.end1 < 0) || (bed.start2 < 0) || (bed.end2 < 0) ) { - cerr << "Error: malformed BED entry at line " << lineNum << ". Coordinate <= 0. Ignoring it and moving on." << endl; - return false; - } - } - else if (lineVector.size() == 1) { - cerr << "Only one BED field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; - exit(1); - } - else if ((lineVector.size() != this->bedType) && (lineVector.size() != 0)) { - cerr << "Differing number of BEDPE fields encountered at line: " << lineNum << ". Exiting..." << endl; - exit(1); - } - else if ((lineVector.size() < 6) && (lineVector.size() != 0)) { - cerr << "TAB delimited BEDPE file with at least 6 fields (chrom1, start1, end1, chrom2, start2, end2) is required at line: "<< lineNum << ". Exiting..." << endl; - exit(1); - } - return false; -} - - -/* - Adapted from kent source "binKeeperFind" -*/ -void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string name, string strand, - vector<MATE> &hits, float overlapFraction, bool forceStrand, bool enforceDiffNames) { - - int startBin, endBin; - startBin = (start >> _binFirstShift); - endBin = ((end-1) >> _binFirstShift); - - // loop through each bin "level" in the binning hierarchy - for (int i = 0; i < _binLevels; ++i) { - - // loop through each bin at this level of the hierarchy - int offset = _binOffsetsExtended[i]; - for (int j = (startBin+offset); j <= (endBin+offset); ++j) { - - // loop through each feature in this chrom/bin and see if it overlaps - // with the feature that was passed in. if so, add the feature to - // the list of hits. - vector<MATE>::const_iterator bedItr; - vector<MATE>::const_iterator bedEnd; - if (bEnd == 1) { - bedItr = bedMapEnd1[chrom][j].begin(); - bedEnd = bedMapEnd1[chrom][j].end(); - } - else if (bEnd == 2) { - bedItr = bedMapEnd2[chrom][j].begin(); - bedEnd = bedMapEnd2[chrom][j].end(); - } - else { - cerr << "Unexpected end of B requested" << endl; - } - for (; bedItr != bedEnd; ++bedItr) { - float overlap = overlaps(bedItr->bed.start, bedItr->bed.end, start, end); - float size = end - start; - - if ( (overlap / size) >= overlapFraction ) { - - // skip the hit if not on the same strand (and we care) - if ((forceStrand == false) && (enforceDiffNames == false)) { - hits.push_back(*bedItr); // it's a hit, add it. - } - else if ((forceStrand == true) && (enforceDiffNames == false)) { - if (strand == bedItr->bed.strand) - hits.push_back(*bedItr); // it's a hit, add it. - } - else if ((forceStrand == true) && (enforceDiffNames == true)) { - if ((strand == bedItr->bed.strand) && (name != bedItr->bed.name)) - hits.push_back(*bedItr); // it's a hit, add it. - } - else if ((forceStrand == false) && (enforceDiffNames == true)) { - if (name != bedItr->bed.name) - hits.push_back(*bedItr); // it's a hit, add it. - } - } - - } - } - startBin >>= _binNextShift; - endBin >>= _binNextShift; - } -} - - -void BedFilePE::loadBedPEFileIntoMap() { - - int lineNum = 0; - int bin1, bin2; - BedLineStatus bedStatus; - BEDPE bedpeEntry, nullBedPE; - - Open(); - bedStatus = this->GetNextBedPE(bedpeEntry, lineNum); - while (bedStatus != BED_INVALID) { - - if (bedStatus == BED_VALID) { - MATE *bedEntry1 = new MATE(); - MATE *bedEntry2 = new MATE(); - // separate the BEDPE entry into separate - // BED entries - splitBedPEIntoBeds(bedpeEntry, lineNum, bedEntry1, bedEntry2); - - // load end1 into a UCSC bin map - bin1 = getBin(bedEntry1->bed.start, bedEntry1->bed.end); - this->bedMapEnd1[bedEntry1->bed.chrom][bin1].push_back(*bedEntry1); - - // load end2 into a UCSC bin map - bin2 = getBin(bedEntry2->bed.start, bedEntry2->bed.end); - this->bedMapEnd2[bedEntry2->bed.chrom][bin2].push_back(*bedEntry2); - - bedpeEntry = nullBedPE; - } - bedStatus = this->GetNextBedPE(bedpeEntry, lineNum); - } - Close(); -} - - -void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, const int &lineNum, MATE *bedEntry1, MATE *bedEntry2) { - - /* - Split the BEDPE entry into separate BED entries - - NOTE: I am using a trick here where I store - the lineNum of the BEDPE from the original file - in the "count" column. This allows me to later - resolve whether the hits found on both ends of BEDPE A - came from the same entry in BEDPE B. Tracking by "name" - alone with fail when there are multiple mappings for a given - read-pair. - */ - - bedEntry1->bed.chrom = bedpeEntry.chrom1; - bedEntry1->bed.start = bedpeEntry.start1; - bedEntry1->bed.end = bedpeEntry.end1; - bedEntry1->bed.name = bedpeEntry.name; - bedEntry1->bed.score = bedpeEntry.score; // only store the score in end1 to save memory - bedEntry1->bed.strand = bedpeEntry.strand1; - bedEntry1->bed.otherFields = bedpeEntry.otherFields; // only store the otherFields in end1 to save memory - bedEntry1->lineNum = lineNum; - bedEntry1->mate = bedEntry2; // keep a pointer to end2 - - bedEntry2->bed.chrom = bedpeEntry.chrom2; - bedEntry2->bed.start = bedpeEntry.start2; - bedEntry2->bed.end = bedpeEntry.end2; - bedEntry2->bed.name = bedpeEntry.name; - bedEntry2->bed.strand = bedpeEntry.strand2; - bedEntry2->lineNum = lineNum; - bedEntry2->mate = bedEntry1; // keep a pointer to end1 -} - - -