Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/maskFastaFromBed/maskFastaFromBed.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfcd8b6c1bda |
---|---|
1 /***************************************************************************** | |
2 maskFastaFromBed.cpp | |
3 | |
4 (c) 2009 - Aaron Quinlan | |
5 Hall Laboratory | |
6 Department of Biochemistry and Molecular Genetics | |
7 University of Virginia | |
8 aaronquinlan@gmail.com | |
9 | |
10 Licenced under the GNU General Public License 2.0 license. | |
11 ******************************************************************************/ | |
12 #include "lineFileUtilities.h" | |
13 #include "maskFastaFromBed.h" | |
14 | |
15 | |
16 MaskFastaFromBed::MaskFastaFromBed(const string &fastaInFile, const string &bedFile, | |
17 const string &fastaOutFile, bool softMask, char maskChar) { | |
18 _softMask = softMask; | |
19 _fastaInFile = fastaInFile; | |
20 _bedFile = bedFile; | |
21 _fastaOutFile = fastaOutFile; | |
22 _maskChar = maskChar; | |
23 _bed = new BedFile(_bedFile); | |
24 | |
25 _bed->loadBedFileIntoMapNoBin(); | |
26 // start masking. | |
27 MaskFasta(); | |
28 } | |
29 | |
30 | |
31 MaskFastaFromBed::~MaskFastaFromBed(void) { | |
32 } | |
33 | |
34 | |
35 //****************************************************************************** | |
36 // Mask the Fasta file based on the coordinates in the BED file. | |
37 //****************************************************************************** | |
38 void MaskFastaFromBed::MaskFasta() { | |
39 | |
40 /* Make sure that we can open all of the files successfully*/ | |
41 | |
42 // open the fasta database for reading | |
43 ifstream fa(_fastaInFile.c_str(), ios::in); | |
44 if ( !fa ) { | |
45 cerr << "Error: The requested fasta file (" << _fastaInFile << ") could not be opened. Exiting!" << endl; | |
46 exit (1); | |
47 } | |
48 | |
49 // open the fasta database for reading | |
50 ofstream faOut(_fastaOutFile.c_str(), ios::out); | |
51 if ( !faOut ) { | |
52 cerr << "Error: The requested fasta output file (" << _fastaOutFile << ") could not be opened. Exiting!" << endl; | |
53 exit (1); | |
54 } | |
55 | |
56 | |
57 /* Read the fastaDb chromosome by chromosome*/ | |
58 string fastaInLine; | |
59 string currChrom; | |
60 string currDNA = ""; | |
61 currDNA.reserve(500000000); | |
62 int fastaWidth = -1; | |
63 bool widthSet = false; | |
64 int start, end, length; | |
65 string replacement; | |
66 | |
67 while (getline(fa,fastaInLine)) { | |
68 | |
69 if (fastaInLine.find(">",0) != 0 ) { | |
70 if (widthSet == false) { | |
71 fastaWidth = fastaInLine.size(); | |
72 widthSet = true; | |
73 } | |
74 currDNA += fastaInLine; | |
75 } | |
76 else { | |
77 if (currDNA.size() > 0) { | |
78 | |
79 vector<BED> bedList = _bed->bedMapNoBin[currChrom]; | |
80 | |
81 /* | |
82 loop through each BED entry for this chrom and | |
83 mask the requested sequence in the FASTA file. | |
84 */ | |
85 for (unsigned int i = 0; i < bedList.size(); i++) { | |
86 start = bedList[i].start; | |
87 end = bedList[i].end; | |
88 length = end - start; | |
89 | |
90 /* | |
91 (1) if soft masking, extract the sequence, lowercase it, | |
92 then put it back | |
93 (2) otherwise replace with Ns | |
94 */ | |
95 if (_softMask) { | |
96 replacement = currDNA.substr(start, length); | |
97 toLowerCase(replacement); | |
98 currDNA.replace(start, length, replacement); | |
99 } | |
100 else { | |
101 string hardmask(length, _maskChar); | |
102 currDNA.replace(start, length, hardmask); | |
103 } | |
104 } | |
105 // write the masked chrom to the output file | |
106 PrettyPrintChrom(faOut, currChrom, currDNA, fastaWidth); | |
107 } | |
108 | |
109 // reset for the next chromosome. | |
110 currChrom = fastaInLine.substr(1, fastaInLine.find_first_of(" ")-1); | |
111 currDNA = ""; | |
112 } | |
113 } | |
114 | |
115 // process the last chromosome. | |
116 // exact same logic as in the main loop. | |
117 if (currDNA.size() > 0) { | |
118 | |
119 vector<BED> bedList = _bed->bedMapNoBin[currChrom]; | |
120 | |
121 for (unsigned int i = 0; i < bedList.size(); i++) { | |
122 start = bedList[i].start; | |
123 end = bedList[i].end; | |
124 length = end - start; | |
125 | |
126 if (_softMask) { | |
127 replacement = currDNA.substr(start, length); | |
128 toLowerCase(replacement); | |
129 currDNA.replace(start, length, replacement); | |
130 } | |
131 else { | |
132 string hardmask(length, _maskChar); | |
133 currDNA.replace(start, length, hardmask); | |
134 } | |
135 } | |
136 PrettyPrintChrom(faOut, currChrom, currDNA, fastaWidth); | |
137 } | |
138 | |
139 // closed for business. | |
140 fa.close(); | |
141 faOut.close(); | |
142 } | |
143 | |
144 | |
145 void MaskFastaFromBed::PrettyPrintChrom(ofstream &out, string chrom, const string &sequence, int width) { | |
146 | |
147 int seqLength = sequence.size(); | |
148 | |
149 out << ">" << chrom << endl; | |
150 for(int i = 0; i < seqLength; i += width) { | |
151 if (i + width < seqLength) out << sequence.substr(i, width) << endl; | |
152 else out << sequence.substr(i, seqLength-i) << endl; | |
153 } | |
154 } | |
155 |