annotate BEDTools-Version-2.14.3/src/fastaFromBed/fastaFromBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
1 /*****************************************************************************
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
2 fastaFromBed.cpp
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
3
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
4 (c) 2009 - Aaron Quinlan
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
5 Hall Laboratory
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
6 Department of Biochemistry and Molecular Genetics
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
7 University of Virginia
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
8 aaronquinlan@gmail.com
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
9
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
10 Licenced under the GNU General Public License 2.0 license.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
11 ******************************************************************************/
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
12 #include "lineFileUtilities.h"
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
13 #include "fastaFromBed.h"
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
14
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
15
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
16 Bed2Fa::Bed2Fa(bool useName, const string &dbFile, const string &bedFile,
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
17 const string &fastaOutFile, bool useFasta, bool useStrand) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
18
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
19 _useName = useName;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
20 _dbFile = dbFile;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
21 _bedFile = bedFile;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
22 _fastaOutFile = fastaOutFile;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
23 _useFasta = useFasta;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
24 _useStrand = useStrand;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
25
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
26 _bed = new BedFile(_bedFile);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
27
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
28 // Figure out what the output file should be.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
29 if (fastaOutFile == "stdout") {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
30 _faOut = &cout;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
31 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
32 else {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
33 // Make sure we can open the file.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
34 ofstream fa(fastaOutFile.c_str(), ios::out);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
35 if ( !fa ) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
36 cerr << "Error: The requested fasta output file (" << fastaOutFile << ") could not be opened. Exiting!" << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
37 exit (1);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
38 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
39 else {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
40 fa.close();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
41 _faOut = new ofstream(fastaOutFile.c_str(), ios::out);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
42 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
43 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
44
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
45 // Extract the requested intervals from the FASTA input file.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
46 ExtractDNA();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
47 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
48
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
49
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
50 Bed2Fa::~Bed2Fa(void) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
51 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
52
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
53
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
54 //******************************************************************************
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
55 // ReportDNA
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
56 //******************************************************************************
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
57 void Bed2Fa::ReportDNA(const BED &bed, string &dna) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
58
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
59 // revcomp if necessary. Thanks to Thomas Doktor.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
60 if ((_useStrand == true) && (bed.strand == "-"))
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
61 reverseComplement(dna);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
62
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
63 if (!(_useName)) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
64 if (_useFasta == true) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
65 if (_useStrand == true)
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
66 *_faOut << ">" << bed.chrom << ":" << bed.start << "-" << bed.end << "(" << bed.strand << ")" << endl << dna << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
67 else
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
68 *_faOut << ">" << bed.chrom << ":" << bed.start << "-" << bed.end << endl << dna << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
69 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
70 else {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
71 if (_useStrand == true)
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
72 *_faOut << bed.chrom << ":" << bed.start << "-" << bed.end << "(" << bed.strand << ")" << "\t" << dna << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
73 else
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
74 *_faOut << bed.chrom << ":" << bed.start << "-" << bed.end << "\t" << dna << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
75 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
76 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
77 else {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
78 if (_useFasta == true)
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
79 *_faOut << ">" << bed.name << endl << dna << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
80 else
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
81 *_faOut << bed.name << "\t" << dna << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
82 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
83 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
84
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
85
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
86
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
87 //******************************************************************************
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
88 // ExtractDNA
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
89 //******************************************************************************
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
90 void Bed2Fa::ExtractDNA() {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
91
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
92 /* Make sure that we can oen all of the files successfully*/
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
93
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
94 // open the fasta database for reading
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
95 ifstream faDb(_dbFile.c_str(), ios::in);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
96 if ( !faDb ) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
97 cerr << "Error: The requested fasta database file (" << _dbFile << ") could not be opened. Exiting!" << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
98 exit (1);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
99 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
100
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
101 // open and memory-map genome file
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
102 FastaReference *fr = new FastaReference;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
103 bool memmap = true;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
104 fr->open(_dbFile, memmap);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
105
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
106 BED bed, nullBed;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
107 int lineNum = 0;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
108 BedLineStatus bedStatus;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
109 string sequence;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
110
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
111 _bed->Open();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
112 while ((bedStatus = _bed->GetNextBed(bed, lineNum)) != BED_INVALID) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
113 if (bedStatus == BED_VALID) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
114 // make sure we are extracting >= 1 bp
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
115 if (bed.zeroLength == false) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
116 size_t seqLength = fr->sequenceLength(bed.chrom);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
117 // make sure this feature will not exceed the end of the chromosome.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
118 if ( (bed.start <= seqLength) && (bed.end <= seqLength) )
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
119 {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
120 int length = bed.end - bed.start;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
121 sequence = fr->getSubSequence(bed.chrom, bed.start, length);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
122 ReportDNA(bed, sequence);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
123 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
124 else
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
125 {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
126 cerr << "Feature (" << bed.chrom << ":" << bed.start << "-" << bed.end << ") beyond the length of "
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
127 << bed.chrom << " size (" << seqLength << " bp). Skipping." << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
128 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
129 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
130 // handle zeroLength
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
131 else {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
132 cerr << "Feature (" << bed.chrom << ":" << bed.start+1 << "-" << bed.end-1 << ") has length = 0, Skipping." << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
133 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
134 bed = nullBed;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
135 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
136 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
137 _bed->Close();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
138 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
139
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
140
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
141