comparison BEDTools-Version-2.14.3/src/maskFastaFromBed/maskFastaFromBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfcd8b6c1bda
1 /*****************************************************************************
2 maskFastaFromBed.cpp
3
4 (c) 2009 - Aaron Quinlan
5 Hall Laboratory
6 Department of Biochemistry and Molecular Genetics
7 University of Virginia
8 aaronquinlan@gmail.com
9
10 Licenced under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #include "lineFileUtilities.h"
13 #include "maskFastaFromBed.h"
14
15
16 MaskFastaFromBed::MaskFastaFromBed(const string &fastaInFile, const string &bedFile,
17 const string &fastaOutFile, bool softMask, char maskChar) {
18 _softMask = softMask;
19 _fastaInFile = fastaInFile;
20 _bedFile = bedFile;
21 _fastaOutFile = fastaOutFile;
22 _maskChar = maskChar;
23 _bed = new BedFile(_bedFile);
24
25 _bed->loadBedFileIntoMapNoBin();
26 // start masking.
27 MaskFasta();
28 }
29
30
31 MaskFastaFromBed::~MaskFastaFromBed(void) {
32 }
33
34
35 //******************************************************************************
36 // Mask the Fasta file based on the coordinates in the BED file.
37 //******************************************************************************
38 void MaskFastaFromBed::MaskFasta() {
39
40 /* Make sure that we can open all of the files successfully*/
41
42 // open the fasta database for reading
43 ifstream fa(_fastaInFile.c_str(), ios::in);
44 if ( !fa ) {
45 cerr << "Error: The requested fasta file (" << _fastaInFile << ") could not be opened. Exiting!" << endl;
46 exit (1);
47 }
48
49 // open the fasta database for reading
50 ofstream faOut(_fastaOutFile.c_str(), ios::out);
51 if ( !faOut ) {
52 cerr << "Error: The requested fasta output file (" << _fastaOutFile << ") could not be opened. Exiting!" << endl;
53 exit (1);
54 }
55
56
57 /* Read the fastaDb chromosome by chromosome*/
58 string fastaInLine;
59 string currChrom;
60 string currDNA = "";
61 currDNA.reserve(500000000);
62 int fastaWidth = -1;
63 bool widthSet = false;
64 int start, end, length;
65 string replacement;
66
67 while (getline(fa,fastaInLine)) {
68
69 if (fastaInLine.find(">",0) != 0 ) {
70 if (widthSet == false) {
71 fastaWidth = fastaInLine.size();
72 widthSet = true;
73 }
74 currDNA += fastaInLine;
75 }
76 else {
77 if (currDNA.size() > 0) {
78
79 vector<BED> bedList = _bed->bedMapNoBin[currChrom];
80
81 /*
82 loop through each BED entry for this chrom and
83 mask the requested sequence in the FASTA file.
84 */
85 for (unsigned int i = 0; i < bedList.size(); i++) {
86 start = bedList[i].start;
87 end = bedList[i].end;
88 length = end - start;
89
90 /*
91 (1) if soft masking, extract the sequence, lowercase it,
92 then put it back
93 (2) otherwise replace with Ns
94 */
95 if (_softMask) {
96 replacement = currDNA.substr(start, length);
97 toLowerCase(replacement);
98 currDNA.replace(start, length, replacement);
99 }
100 else {
101 string hardmask(length, _maskChar);
102 currDNA.replace(start, length, hardmask);
103 }
104 }
105 // write the masked chrom to the output file
106 PrettyPrintChrom(faOut, currChrom, currDNA, fastaWidth);
107 }
108
109 // reset for the next chromosome.
110 currChrom = fastaInLine.substr(1, fastaInLine.find_first_of(" ")-1);
111 currDNA = "";
112 }
113 }
114
115 // process the last chromosome.
116 // exact same logic as in the main loop.
117 if (currDNA.size() > 0) {
118
119 vector<BED> bedList = _bed->bedMapNoBin[currChrom];
120
121 for (unsigned int i = 0; i < bedList.size(); i++) {
122 start = bedList[i].start;
123 end = bedList[i].end;
124 length = end - start;
125
126 if (_softMask) {
127 replacement = currDNA.substr(start, length);
128 toLowerCase(replacement);
129 currDNA.replace(start, length, replacement);
130 }
131 else {
132 string hardmask(length, _maskChar);
133 currDNA.replace(start, length, hardmask);
134 }
135 }
136 PrettyPrintChrom(faOut, currChrom, currDNA, fastaWidth);
137 }
138
139 // closed for business.
140 fa.close();
141 faOut.close();
142 }
143
144
145 void MaskFastaFromBed::PrettyPrintChrom(ofstream &out, string chrom, const string &sequence, int width) {
146
147 int seqLength = sequence.size();
148
149 out << ">" << chrom << endl;
150 for(int i = 0; i < seqLength; i += width) {
151 if (i + width < seqLength) out << sequence.substr(i, width) << endl;
152 else out << sequence.substr(i, seqLength-i) << endl;
153 }
154 }
155