annotate BEDTools-Version-2.14.3/src/maskFastaFromBed/maskFastaFromBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
1 /*****************************************************************************
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
2 maskFastaFromBed.cpp
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
3
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
4 (c) 2009 - Aaron Quinlan
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
5 Hall Laboratory
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
6 Department of Biochemistry and Molecular Genetics
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
7 University of Virginia
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
8 aaronquinlan@gmail.com
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
9
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
10 Licenced under the GNU General Public License 2.0 license.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
11 ******************************************************************************/
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
12 #include "lineFileUtilities.h"
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
13 #include "maskFastaFromBed.h"
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
14
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
15
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
16 MaskFastaFromBed::MaskFastaFromBed(const string &fastaInFile, const string &bedFile,
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
17 const string &fastaOutFile, bool softMask, char maskChar) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
18 _softMask = softMask;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
19 _fastaInFile = fastaInFile;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
20 _bedFile = bedFile;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
21 _fastaOutFile = fastaOutFile;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
22 _maskChar = maskChar;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
23 _bed = new BedFile(_bedFile);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
24
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
25 _bed->loadBedFileIntoMapNoBin();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
26 // start masking.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
27 MaskFasta();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
28 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
29
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
30
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
31 MaskFastaFromBed::~MaskFastaFromBed(void) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
32 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
33
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
34
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
35 //******************************************************************************
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
36 // Mask the Fasta file based on the coordinates in the BED file.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
37 //******************************************************************************
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
38 void MaskFastaFromBed::MaskFasta() {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
39
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
40 /* Make sure that we can open all of the files successfully*/
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
41
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
42 // open the fasta database for reading
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
43 ifstream fa(_fastaInFile.c_str(), ios::in);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
44 if ( !fa ) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
45 cerr << "Error: The requested fasta file (" << _fastaInFile << ") could not be opened. Exiting!" << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
46 exit (1);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
47 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
48
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
49 // open the fasta database for reading
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
50 ofstream faOut(_fastaOutFile.c_str(), ios::out);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
51 if ( !faOut ) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
52 cerr << "Error: The requested fasta output file (" << _fastaOutFile << ") could not be opened. Exiting!" << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
53 exit (1);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
54 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
55
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
56
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
57 /* Read the fastaDb chromosome by chromosome*/
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
58 string fastaInLine;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
59 string currChrom;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
60 string currDNA = "";
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
61 currDNA.reserve(500000000);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
62 int fastaWidth = -1;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
63 bool widthSet = false;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
64 int start, end, length;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
65 string replacement;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
66
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
67 while (getline(fa,fastaInLine)) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
68
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
69 if (fastaInLine.find(">",0) != 0 ) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
70 if (widthSet == false) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
71 fastaWidth = fastaInLine.size();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
72 widthSet = true;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
73 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
74 currDNA += fastaInLine;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
75 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
76 else {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
77 if (currDNA.size() > 0) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
78
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
79 vector<BED> bedList = _bed->bedMapNoBin[currChrom];
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
80
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
81 /*
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
82 loop through each BED entry for this chrom and
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
83 mask the requested sequence in the FASTA file.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
84 */
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
85 for (unsigned int i = 0; i < bedList.size(); i++) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
86 start = bedList[i].start;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
87 end = bedList[i].end;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
88 length = end - start;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
89
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
90 /*
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
91 (1) if soft masking, extract the sequence, lowercase it,
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
92 then put it back
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
93 (2) otherwise replace with Ns
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
94 */
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
95 if (_softMask) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
96 replacement = currDNA.substr(start, length);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
97 toLowerCase(replacement);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
98 currDNA.replace(start, length, replacement);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
99 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
100 else {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
101 string hardmask(length, _maskChar);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
102 currDNA.replace(start, length, hardmask);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
103 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
104 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
105 // write the masked chrom to the output file
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
106 PrettyPrintChrom(faOut, currChrom, currDNA, fastaWidth);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
107 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
108
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
109 // reset for the next chromosome.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
110 currChrom = fastaInLine.substr(1, fastaInLine.find_first_of(" ")-1);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
111 currDNA = "";
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
112 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
113 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
114
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
115 // process the last chromosome.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
116 // exact same logic as in the main loop.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
117 if (currDNA.size() > 0) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
118
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
119 vector<BED> bedList = _bed->bedMapNoBin[currChrom];
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
120
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
121 for (unsigned int i = 0; i < bedList.size(); i++) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
122 start = bedList[i].start;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
123 end = bedList[i].end;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
124 length = end - start;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
125
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
126 if (_softMask) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
127 replacement = currDNA.substr(start, length);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
128 toLowerCase(replacement);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
129 currDNA.replace(start, length, replacement);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
130 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
131 else {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
132 string hardmask(length, _maskChar);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
133 currDNA.replace(start, length, hardmask);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
134 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
135 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
136 PrettyPrintChrom(faOut, currChrom, currDNA, fastaWidth);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
137 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
138
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
139 // closed for business.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
140 fa.close();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
141 faOut.close();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
142 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
143
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
144
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
145 void MaskFastaFromBed::PrettyPrintChrom(ofstream &out, string chrom, const string &sequence, int width) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
146
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
147 int seqLength = sequence.size();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
148
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
149 out << ">" << chrom << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
150 for(int i = 0; i < seqLength; i += width) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
151 if (i + width < seqLength) out << sequence.substr(i, width) << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
152 else out << sequence.substr(i, seqLength-i) << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
153 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
154 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
155