Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/maskFastaFromBed/maskFastaFromBed.cpp @ 0:dfcd8b6c1bda
Uploaded
| author | aaronquinlan |
|---|---|
| date | Thu, 03 Nov 2011 10:25:04 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dfcd8b6c1bda |
|---|---|
| 1 /***************************************************************************** | |
| 2 maskFastaFromBed.cpp | |
| 3 | |
| 4 (c) 2009 - Aaron Quinlan | |
| 5 Hall Laboratory | |
| 6 Department of Biochemistry and Molecular Genetics | |
| 7 University of Virginia | |
| 8 aaronquinlan@gmail.com | |
| 9 | |
| 10 Licenced under the GNU General Public License 2.0 license. | |
| 11 ******************************************************************************/ | |
| 12 #include "lineFileUtilities.h" | |
| 13 #include "maskFastaFromBed.h" | |
| 14 | |
| 15 | |
| 16 MaskFastaFromBed::MaskFastaFromBed(const string &fastaInFile, const string &bedFile, | |
| 17 const string &fastaOutFile, bool softMask, char maskChar) { | |
| 18 _softMask = softMask; | |
| 19 _fastaInFile = fastaInFile; | |
| 20 _bedFile = bedFile; | |
| 21 _fastaOutFile = fastaOutFile; | |
| 22 _maskChar = maskChar; | |
| 23 _bed = new BedFile(_bedFile); | |
| 24 | |
| 25 _bed->loadBedFileIntoMapNoBin(); | |
| 26 // start masking. | |
| 27 MaskFasta(); | |
| 28 } | |
| 29 | |
| 30 | |
| 31 MaskFastaFromBed::~MaskFastaFromBed(void) { | |
| 32 } | |
| 33 | |
| 34 | |
| 35 //****************************************************************************** | |
| 36 // Mask the Fasta file based on the coordinates in the BED file. | |
| 37 //****************************************************************************** | |
| 38 void MaskFastaFromBed::MaskFasta() { | |
| 39 | |
| 40 /* Make sure that we can open all of the files successfully*/ | |
| 41 | |
| 42 // open the fasta database for reading | |
| 43 ifstream fa(_fastaInFile.c_str(), ios::in); | |
| 44 if ( !fa ) { | |
| 45 cerr << "Error: The requested fasta file (" << _fastaInFile << ") could not be opened. Exiting!" << endl; | |
| 46 exit (1); | |
| 47 } | |
| 48 | |
| 49 // open the fasta database for reading | |
| 50 ofstream faOut(_fastaOutFile.c_str(), ios::out); | |
| 51 if ( !faOut ) { | |
| 52 cerr << "Error: The requested fasta output file (" << _fastaOutFile << ") could not be opened. Exiting!" << endl; | |
| 53 exit (1); | |
| 54 } | |
| 55 | |
| 56 | |
| 57 /* Read the fastaDb chromosome by chromosome*/ | |
| 58 string fastaInLine; | |
| 59 string currChrom; | |
| 60 string currDNA = ""; | |
| 61 currDNA.reserve(500000000); | |
| 62 int fastaWidth = -1; | |
| 63 bool widthSet = false; | |
| 64 int start, end, length; | |
| 65 string replacement; | |
| 66 | |
| 67 while (getline(fa,fastaInLine)) { | |
| 68 | |
| 69 if (fastaInLine.find(">",0) != 0 ) { | |
| 70 if (widthSet == false) { | |
| 71 fastaWidth = fastaInLine.size(); | |
| 72 widthSet = true; | |
| 73 } | |
| 74 currDNA += fastaInLine; | |
| 75 } | |
| 76 else { | |
| 77 if (currDNA.size() > 0) { | |
| 78 | |
| 79 vector<BED> bedList = _bed->bedMapNoBin[currChrom]; | |
| 80 | |
| 81 /* | |
| 82 loop through each BED entry for this chrom and | |
| 83 mask the requested sequence in the FASTA file. | |
| 84 */ | |
| 85 for (unsigned int i = 0; i < bedList.size(); i++) { | |
| 86 start = bedList[i].start; | |
| 87 end = bedList[i].end; | |
| 88 length = end - start; | |
| 89 | |
| 90 /* | |
| 91 (1) if soft masking, extract the sequence, lowercase it, | |
| 92 then put it back | |
| 93 (2) otherwise replace with Ns | |
| 94 */ | |
| 95 if (_softMask) { | |
| 96 replacement = currDNA.substr(start, length); | |
| 97 toLowerCase(replacement); | |
| 98 currDNA.replace(start, length, replacement); | |
| 99 } | |
| 100 else { | |
| 101 string hardmask(length, _maskChar); | |
| 102 currDNA.replace(start, length, hardmask); | |
| 103 } | |
| 104 } | |
| 105 // write the masked chrom to the output file | |
| 106 PrettyPrintChrom(faOut, currChrom, currDNA, fastaWidth); | |
| 107 } | |
| 108 | |
| 109 // reset for the next chromosome. | |
| 110 currChrom = fastaInLine.substr(1, fastaInLine.find_first_of(" ")-1); | |
| 111 currDNA = ""; | |
| 112 } | |
| 113 } | |
| 114 | |
| 115 // process the last chromosome. | |
| 116 // exact same logic as in the main loop. | |
| 117 if (currDNA.size() > 0) { | |
| 118 | |
| 119 vector<BED> bedList = _bed->bedMapNoBin[currChrom]; | |
| 120 | |
| 121 for (unsigned int i = 0; i < bedList.size(); i++) { | |
| 122 start = bedList[i].start; | |
| 123 end = bedList[i].end; | |
| 124 length = end - start; | |
| 125 | |
| 126 if (_softMask) { | |
| 127 replacement = currDNA.substr(start, length); | |
| 128 toLowerCase(replacement); | |
| 129 currDNA.replace(start, length, replacement); | |
| 130 } | |
| 131 else { | |
| 132 string hardmask(length, _maskChar); | |
| 133 currDNA.replace(start, length, hardmask); | |
| 134 } | |
| 135 } | |
| 136 PrettyPrintChrom(faOut, currChrom, currDNA, fastaWidth); | |
| 137 } | |
| 138 | |
| 139 // closed for business. | |
| 140 fa.close(); | |
| 141 faOut.close(); | |
| 142 } | |
| 143 | |
| 144 | |
| 145 void MaskFastaFromBed::PrettyPrintChrom(ofstream &out, string chrom, const string &sequence, int width) { | |
| 146 | |
| 147 int seqLength = sequence.size(); | |
| 148 | |
| 149 out << ">" << chrom << endl; | |
| 150 for(int i = 0; i < seqLength; i += width) { | |
| 151 if (i + width < seqLength) out << sequence.substr(i, width) << endl; | |
| 152 else out << sequence.substr(i, seqLength-i) << endl; | |
| 153 } | |
| 154 } | |
| 155 |
