0
|
1 /*****************************************************************************
|
|
2 fastaFromBed.cpp
|
|
3
|
|
4 (c) 2009 - Aaron Quinlan
|
|
5 Hall Laboratory
|
|
6 Department of Biochemistry and Molecular Genetics
|
|
7 University of Virginia
|
|
8 aaronquinlan@gmail.com
|
|
9
|
|
10 Licenced under the GNU General Public License 2.0 license.
|
|
11 ******************************************************************************/
|
|
12 #include "lineFileUtilities.h"
|
|
13 #include "fastaFromBed.h"
|
|
14
|
|
15
|
|
16 Bed2Fa::Bed2Fa(bool useName, const string &dbFile, const string &bedFile,
|
|
17 const string &fastaOutFile, bool useFasta, bool useStrand) {
|
|
18
|
|
19 _useName = useName;
|
|
20 _dbFile = dbFile;
|
|
21 _bedFile = bedFile;
|
|
22 _fastaOutFile = fastaOutFile;
|
|
23 _useFasta = useFasta;
|
|
24 _useStrand = useStrand;
|
|
25
|
|
26 _bed = new BedFile(_bedFile);
|
|
27
|
|
28 // Figure out what the output file should be.
|
|
29 if (fastaOutFile == "stdout") {
|
|
30 _faOut = &cout;
|
|
31 }
|
|
32 else {
|
|
33 // Make sure we can open the file.
|
|
34 ofstream fa(fastaOutFile.c_str(), ios::out);
|
|
35 if ( !fa ) {
|
|
36 cerr << "Error: The requested fasta output file (" << fastaOutFile << ") could not be opened. Exiting!" << endl;
|
|
37 exit (1);
|
|
38 }
|
|
39 else {
|
|
40 fa.close();
|
|
41 _faOut = new ofstream(fastaOutFile.c_str(), ios::out);
|
|
42 }
|
|
43 }
|
|
44
|
|
45 // Extract the requested intervals from the FASTA input file.
|
|
46 ExtractDNA();
|
|
47 }
|
|
48
|
|
49
|
|
50 Bed2Fa::~Bed2Fa(void) {
|
|
51 }
|
|
52
|
|
53
|
|
54 //******************************************************************************
|
|
55 // ReportDNA
|
|
56 //******************************************************************************
|
|
57 void Bed2Fa::ReportDNA(const BED &bed, string &dna) {
|
|
58
|
|
59 // revcomp if necessary. Thanks to Thomas Doktor.
|
|
60 if ((_useStrand == true) && (bed.strand == "-"))
|
|
61 reverseComplement(dna);
|
|
62
|
|
63 if (!(_useName)) {
|
|
64 if (_useFasta == true) {
|
|
65 if (_useStrand == true)
|
|
66 *_faOut << ">" << bed.chrom << ":" << bed.start << "-" << bed.end << "(" << bed.strand << ")" << endl << dna << endl;
|
|
67 else
|
|
68 *_faOut << ">" << bed.chrom << ":" << bed.start << "-" << bed.end << endl << dna << endl;
|
|
69 }
|
|
70 else {
|
|
71 if (_useStrand == true)
|
|
72 *_faOut << bed.chrom << ":" << bed.start << "-" << bed.end << "(" << bed.strand << ")" << "\t" << dna << endl;
|
|
73 else
|
|
74 *_faOut << bed.chrom << ":" << bed.start << "-" << bed.end << "\t" << dna << endl;
|
|
75 }
|
|
76 }
|
|
77 else {
|
|
78 if (_useFasta == true)
|
|
79 *_faOut << ">" << bed.name << endl << dna << endl;
|
|
80 else
|
|
81 *_faOut << bed.name << "\t" << dna << endl;
|
|
82 }
|
|
83 }
|
|
84
|
|
85
|
|
86
|
|
87 //******************************************************************************
|
|
88 // ExtractDNA
|
|
89 //******************************************************************************
|
|
90 void Bed2Fa::ExtractDNA() {
|
|
91
|
|
92 /* Make sure that we can oen all of the files successfully*/
|
|
93
|
|
94 // open the fasta database for reading
|
|
95 ifstream faDb(_dbFile.c_str(), ios::in);
|
|
96 if ( !faDb ) {
|
|
97 cerr << "Error: The requested fasta database file (" << _dbFile << ") could not be opened. Exiting!" << endl;
|
|
98 exit (1);
|
|
99 }
|
|
100
|
|
101 // open and memory-map genome file
|
|
102 FastaReference *fr = new FastaReference;
|
|
103 bool memmap = true;
|
|
104 fr->open(_dbFile, memmap);
|
|
105
|
|
106 BED bed, nullBed;
|
|
107 int lineNum = 0;
|
|
108 BedLineStatus bedStatus;
|
|
109 string sequence;
|
|
110
|
|
111 _bed->Open();
|
|
112 while ((bedStatus = _bed->GetNextBed(bed, lineNum)) != BED_INVALID) {
|
|
113 if (bedStatus == BED_VALID) {
|
|
114 // make sure we are extracting >= 1 bp
|
|
115 if (bed.zeroLength == false) {
|
|
116 size_t seqLength = fr->sequenceLength(bed.chrom);
|
|
117 // make sure this feature will not exceed the end of the chromosome.
|
|
118 if ( (bed.start <= seqLength) && (bed.end <= seqLength) )
|
|
119 {
|
|
120 int length = bed.end - bed.start;
|
|
121 sequence = fr->getSubSequence(bed.chrom, bed.start, length);
|
|
122 ReportDNA(bed, sequence);
|
|
123 }
|
|
124 else
|
|
125 {
|
|
126 cerr << "Feature (" << bed.chrom << ":" << bed.start << "-" << bed.end << ") beyond the length of "
|
|
127 << bed.chrom << " size (" << seqLength << " bp). Skipping." << endl;
|
|
128 }
|
|
129 }
|
|
130 // handle zeroLength
|
|
131 else {
|
|
132 cerr << "Feature (" << bed.chrom << ":" << bed.start+1 << "-" << bed.end-1 << ") has length = 0, Skipping." << endl;
|
|
133 }
|
|
134 bed = nullBed;
|
|
135 }
|
|
136 }
|
|
137 _bed->Close();
|
|
138 }
|
|
139
|
|
140
|
|
141
|