0
|
1 /*****************************************************************************
|
|
2 maskFastaFromBed.cpp
|
|
3
|
|
4 (c) 2009 - Aaron Quinlan
|
|
5 Hall Laboratory
|
|
6 Department of Biochemistry and Molecular Genetics
|
|
7 University of Virginia
|
|
8 aaronquinlan@gmail.com
|
|
9
|
|
10 Licenced under the GNU General Public License 2.0 license.
|
|
11 ******************************************************************************/
|
|
12 #include "lineFileUtilities.h"
|
|
13 #include "maskFastaFromBed.h"
|
|
14
|
|
15
|
|
16 MaskFastaFromBed::MaskFastaFromBed(const string &fastaInFile, const string &bedFile,
|
|
17 const string &fastaOutFile, bool softMask, char maskChar) {
|
|
18 _softMask = softMask;
|
|
19 _fastaInFile = fastaInFile;
|
|
20 _bedFile = bedFile;
|
|
21 _fastaOutFile = fastaOutFile;
|
|
22 _maskChar = maskChar;
|
|
23 _bed = new BedFile(_bedFile);
|
|
24
|
|
25 _bed->loadBedFileIntoMapNoBin();
|
|
26 // start masking.
|
|
27 MaskFasta();
|
|
28 }
|
|
29
|
|
30
|
|
31 MaskFastaFromBed::~MaskFastaFromBed(void) {
|
|
32 }
|
|
33
|
|
34
|
|
35 //******************************************************************************
|
|
36 // Mask the Fasta file based on the coordinates in the BED file.
|
|
37 //******************************************************************************
|
|
38 void MaskFastaFromBed::MaskFasta() {
|
|
39
|
|
40 /* Make sure that we can open all of the files successfully*/
|
|
41
|
|
42 // open the fasta database for reading
|
|
43 ifstream fa(_fastaInFile.c_str(), ios::in);
|
|
44 if ( !fa ) {
|
|
45 cerr << "Error: The requested fasta file (" << _fastaInFile << ") could not be opened. Exiting!" << endl;
|
|
46 exit (1);
|
|
47 }
|
|
48
|
|
49 // open the fasta database for reading
|
|
50 ofstream faOut(_fastaOutFile.c_str(), ios::out);
|
|
51 if ( !faOut ) {
|
|
52 cerr << "Error: The requested fasta output file (" << _fastaOutFile << ") could not be opened. Exiting!" << endl;
|
|
53 exit (1);
|
|
54 }
|
|
55
|
|
56
|
|
57 /* Read the fastaDb chromosome by chromosome*/
|
|
58 string fastaInLine;
|
|
59 string currChrom;
|
|
60 string currDNA = "";
|
|
61 currDNA.reserve(500000000);
|
|
62 int fastaWidth = -1;
|
|
63 bool widthSet = false;
|
|
64 int start, end, length;
|
|
65 string replacement;
|
|
66
|
|
67 while (getline(fa,fastaInLine)) {
|
|
68
|
|
69 if (fastaInLine.find(">",0) != 0 ) {
|
|
70 if (widthSet == false) {
|
|
71 fastaWidth = fastaInLine.size();
|
|
72 widthSet = true;
|
|
73 }
|
|
74 currDNA += fastaInLine;
|
|
75 }
|
|
76 else {
|
|
77 if (currDNA.size() > 0) {
|
|
78
|
|
79 vector<BED> bedList = _bed->bedMapNoBin[currChrom];
|
|
80
|
|
81 /*
|
|
82 loop through each BED entry for this chrom and
|
|
83 mask the requested sequence in the FASTA file.
|
|
84 */
|
|
85 for (unsigned int i = 0; i < bedList.size(); i++) {
|
|
86 start = bedList[i].start;
|
|
87 end = bedList[i].end;
|
|
88 length = end - start;
|
|
89
|
|
90 /*
|
|
91 (1) if soft masking, extract the sequence, lowercase it,
|
|
92 then put it back
|
|
93 (2) otherwise replace with Ns
|
|
94 */
|
|
95 if (_softMask) {
|
|
96 replacement = currDNA.substr(start, length);
|
|
97 toLowerCase(replacement);
|
|
98 currDNA.replace(start, length, replacement);
|
|
99 }
|
|
100 else {
|
|
101 string hardmask(length, _maskChar);
|
|
102 currDNA.replace(start, length, hardmask);
|
|
103 }
|
|
104 }
|
|
105 // write the masked chrom to the output file
|
|
106 PrettyPrintChrom(faOut, currChrom, currDNA, fastaWidth);
|
|
107 }
|
|
108
|
|
109 // reset for the next chromosome.
|
|
110 currChrom = fastaInLine.substr(1, fastaInLine.find_first_of(" ")-1);
|
|
111 currDNA = "";
|
|
112 }
|
|
113 }
|
|
114
|
|
115 // process the last chromosome.
|
|
116 // exact same logic as in the main loop.
|
|
117 if (currDNA.size() > 0) {
|
|
118
|
|
119 vector<BED> bedList = _bed->bedMapNoBin[currChrom];
|
|
120
|
|
121 for (unsigned int i = 0; i < bedList.size(); i++) {
|
|
122 start = bedList[i].start;
|
|
123 end = bedList[i].end;
|
|
124 length = end - start;
|
|
125
|
|
126 if (_softMask) {
|
|
127 replacement = currDNA.substr(start, length);
|
|
128 toLowerCase(replacement);
|
|
129 currDNA.replace(start, length, replacement);
|
|
130 }
|
|
131 else {
|
|
132 string hardmask(length, _maskChar);
|
|
133 currDNA.replace(start, length, hardmask);
|
|
134 }
|
|
135 }
|
|
136 PrettyPrintChrom(faOut, currChrom, currDNA, fastaWidth);
|
|
137 }
|
|
138
|
|
139 // closed for business.
|
|
140 fa.close();
|
|
141 faOut.close();
|
|
142 }
|
|
143
|
|
144
|
|
145 void MaskFastaFromBed::PrettyPrintChrom(ofstream &out, string chrom, const string &sequence, int width) {
|
|
146
|
|
147 int seqLength = sequence.size();
|
|
148
|
|
149 out << ">" << chrom << endl;
|
|
150 for(int i = 0; i < seqLength; i += width) {
|
|
151 if (i + width < seqLength) out << sequence.substr(i, width) << endl;
|
|
152 else out << sequence.substr(i, seqLength-i) << endl;
|
|
153 }
|
|
154 }
|
|
155
|