annotate BEDTools-Version-2.14.3/src/nucBed/nucBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
1 /*****************************************************************************
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
2 nucBed.cpp
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
3
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
4 (c) 2009 - Aaron Quinlan
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
5 Hall Laboratory
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
6 Department of Biochemistry and Molecular Genetics
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
7 University of Virginia
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
8 aaronquinlan@gmail.com
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
9
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
10 Licenced under the GNU General Public License 2.0 license.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
11 ******************************************************************************/
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
12 #include "lineFileUtilities.h"
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
13 #include "nucBed.h"
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
14
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
15
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
16 NucBed::NucBed(string &dbFile, string &bedFile, bool printSeq,
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
17 bool hasPattern, const string &pattern, bool forceStrand) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
18
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
19 _dbFile = dbFile;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
20 _bedFile = bedFile;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
21 _printSeq = printSeq;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
22 _hasPattern = hasPattern;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
23 _pattern = pattern;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
24 _forceStrand = forceStrand;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
25
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
26 _bed = new BedFile(_bedFile);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
27
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
28 // Compute the DNA content in each BED/GFF/VCF interval
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
29 ProfileDNA();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
30 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
31
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
32
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
33 NucBed::~NucBed(void)
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
34 {}
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
35
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
36
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
37 void NucBed::ReportDnaProfile(const BED& bed, const string &sequence, int seqLength)
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
38 {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
39 int a,c,g,t,n,other,userPatternCount;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
40 a = c = g = t = n = other = userPatternCount = 0;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
41
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
42 getDnaContent(sequence,a,c,g,t,n,other);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
43
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
44 if (_hasPattern)
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
45 userPatternCount = countPattern(sequence, _pattern);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
46
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
47
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
48 // report the original interval
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
49 _bed->reportBedTab(bed);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
50 // report AT and GC content
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
51 printf("%f\t%f\t",(float)(a+t)/seqLength, (float)(c+g)/seqLength);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
52 // report raw nucleotide counts
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
53 printf("%d\t%d\t%d\t%d\t%d\t%d\t%d",a,c,g,t,n,other,seqLength);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
54 // add the original sequence if requested.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
55
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
56 if (_printSeq)
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
57 printf("\t%s",sequence.c_str());
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
58 if (_hasPattern)
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
59 printf("\t%d",userPatternCount);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
60 printf("\n");
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
61
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
62 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
63
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
64
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
65 void NucBed::PrintHeader(void) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
66 printf("#");
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
67
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
68 int numOrigColumns = (int) _bed->bedType;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
69 for (int i = 1; i <= numOrigColumns; ++i) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
70 printf("%d_usercol\t", i);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
71 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
72 printf("%d_pct_at\t", numOrigColumns + 1);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
73 printf("%d_pct_gc\t", numOrigColumns + 2);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
74 printf("%d_num_A\t", numOrigColumns + 3);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
75 printf("%d_num_C\t", numOrigColumns + 4);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
76 printf("%d_num_G\t", numOrigColumns + 5);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
77 printf("%d_num_T\t", numOrigColumns + 6);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
78 printf("%d_num_N\t", numOrigColumns + 7);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
79 printf("%d_num_oth\t", numOrigColumns + 8);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
80 printf("%d_seq_len\t", numOrigColumns + 9);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
81
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
82 if (_printSeq)
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
83 printf("%d_seq", numOrigColumns + 10);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
84 if (_hasPattern && !_printSeq)
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
85 printf("%d_user_patt_count", numOrigColumns + 10);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
86 else if (_hasPattern && _printSeq)
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
87 printf("\t%d_user_patt_count", numOrigColumns + 11);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
88 printf("\n");
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
89
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
90 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
91
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
92
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
93 //******************************************************************************
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
94 // ExtractDNA
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
95 //******************************************************************************
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
96 void NucBed::ProfileDNA() {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
97
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
98 /* Make sure that we can oen all of the files successfully*/
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
99
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
100 // open the fasta database for reading
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
101 ifstream faDb(_dbFile.c_str(), ios::in);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
102 if ( !faDb ) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
103 cerr << "Error: The requested fasta database file (" << _dbFile << ") could not be opened. Exiting!" << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
104 exit (1);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
105 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
106
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
107 // open and memory-map genome file
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
108 FastaReference fr;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
109 bool memmap = true;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
110 fr.open(_dbFile, memmap);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
111
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
112 bool headerReported = false;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
113 BED bed, nullBed;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
114 int lineNum = 0;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
115 BedLineStatus bedStatus;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
116 string sequence;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
117
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
118 _bed->Open();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
119 while ((bedStatus = _bed->GetNextBed(bed, lineNum)) != BED_INVALID) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
120 if (bedStatus == BED_VALID) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
121 if (headerReported == false) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
122 PrintHeader();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
123 headerReported = true;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
124 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
125 // make sure we are extracting >= 1 bp
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
126 if (bed.zeroLength == false) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
127 size_t seqLength = fr.sequenceLength(bed.chrom);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
128 // make sure this feature will not exceed the end of the chromosome.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
129 if ( (bed.start <= seqLength) && (bed.end <= seqLength) )
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
130 {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
131 // grab the dna at this interval
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
132 int length = bed.end - bed.start;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
133 // report the sequence's content
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
134 string dna = fr.getSubSequence(bed.chrom, bed.start, length);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
135 // rev comp si necessaire
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
136 if ((_forceStrand == true) && (bed.strand == "-"))
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
137 reverseComplement(dna);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
138 ReportDnaProfile(bed, dna, length);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
139 bed = nullBed;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
140 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
141 else
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
142 {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
143 cerr << "Feature (" << bed.chrom << ":" << bed.start << "-" << bed.end << ") beyond the length of "
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
144 << bed.chrom << " size (" << seqLength << " bp). Skipping." << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
145 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
146 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
147 // handle zeroLength
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
148 else {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
149 cerr << "Feature (" << bed.chrom << ":" << bed.start+1 << "-" << bed.end-1 << ") has length = 0, Skipping." << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
150 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
151 bed = nullBed;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
152 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
153 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
154 _bed->Close();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
155 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
156
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
157
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
158