Mercurial > repos > aaronquinlan > multi_intersect
diff BEDTools-Version-2.14.3/src/maskFastaFromBed/maskFastaFromBed.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BEDTools-Version-2.14.3/src/maskFastaFromBed/maskFastaFromBed.cpp Thu Nov 03 10:25:04 2011 -0400 @@ -0,0 +1,155 @@ +/***************************************************************************** + maskFastaFromBed.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "lineFileUtilities.h" +#include "maskFastaFromBed.h" + + +MaskFastaFromBed::MaskFastaFromBed(const string &fastaInFile, const string &bedFile, + const string &fastaOutFile, bool softMask, char maskChar) { + _softMask = softMask; + _fastaInFile = fastaInFile; + _bedFile = bedFile; + _fastaOutFile = fastaOutFile; + _maskChar = maskChar; + _bed = new BedFile(_bedFile); + + _bed->loadBedFileIntoMapNoBin(); + // start masking. + MaskFasta(); +} + + +MaskFastaFromBed::~MaskFastaFromBed(void) { +} + + +//****************************************************************************** +// Mask the Fasta file based on the coordinates in the BED file. +//****************************************************************************** +void MaskFastaFromBed::MaskFasta() { + + /* Make sure that we can open all of the files successfully*/ + + // open the fasta database for reading + ifstream fa(_fastaInFile.c_str(), ios::in); + if ( !fa ) { + cerr << "Error: The requested fasta file (" << _fastaInFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + + // open the fasta database for reading + ofstream faOut(_fastaOutFile.c_str(), ios::out); + if ( !faOut ) { + cerr << "Error: The requested fasta output file (" << _fastaOutFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + + + /* Read the fastaDb chromosome by chromosome*/ + string fastaInLine; + string currChrom; + string currDNA = ""; + currDNA.reserve(500000000); + int fastaWidth = -1; + bool widthSet = false; + int start, end, length; + string replacement; + + while (getline(fa,fastaInLine)) { + + if (fastaInLine.find(">",0) != 0 ) { + if (widthSet == false) { + fastaWidth = fastaInLine.size(); + widthSet = true; + } + currDNA += fastaInLine; + } + else { + if (currDNA.size() > 0) { + + vector<BED> bedList = _bed->bedMapNoBin[currChrom]; + + /* + loop through each BED entry for this chrom and + mask the requested sequence in the FASTA file. + */ + for (unsigned int i = 0; i < bedList.size(); i++) { + start = bedList[i].start; + end = bedList[i].end; + length = end - start; + + /* + (1) if soft masking, extract the sequence, lowercase it, + then put it back + (2) otherwise replace with Ns + */ + if (_softMask) { + replacement = currDNA.substr(start, length); + toLowerCase(replacement); + currDNA.replace(start, length, replacement); + } + else { + string hardmask(length, _maskChar); + currDNA.replace(start, length, hardmask); + } + } + // write the masked chrom to the output file + PrettyPrintChrom(faOut, currChrom, currDNA, fastaWidth); + } + + // reset for the next chromosome. + currChrom = fastaInLine.substr(1, fastaInLine.find_first_of(" ")-1); + currDNA = ""; + } + } + + // process the last chromosome. + // exact same logic as in the main loop. + if (currDNA.size() > 0) { + + vector<BED> bedList = _bed->bedMapNoBin[currChrom]; + + for (unsigned int i = 0; i < bedList.size(); i++) { + start = bedList[i].start; + end = bedList[i].end; + length = end - start; + + if (_softMask) { + replacement = currDNA.substr(start, length); + toLowerCase(replacement); + currDNA.replace(start, length, replacement); + } + else { + string hardmask(length, _maskChar); + currDNA.replace(start, length, hardmask); + } + } + PrettyPrintChrom(faOut, currChrom, currDNA, fastaWidth); + } + + // closed for business. + fa.close(); + faOut.close(); +} + + +void MaskFastaFromBed::PrettyPrintChrom(ofstream &out, string chrom, const string &sequence, int width) { + + int seqLength = sequence.size(); + + out << ">" << chrom << endl; + for(int i = 0; i < seqLength; i += width) { + if (i + width < seqLength) out << sequence.substr(i, width) << endl; + else out << sequence.substr(i, seqLength-i) << endl; + } +} +