comparison BEDTools-Version-2.14.3/src/nucBed/nucBed.cpp @ 1:bec36315bd12 default tip

Deleted selected files
author aaronquinlan
date Sat, 19 Nov 2011 14:17:03 -0500
parents dfcd8b6c1bda
children
comparison
equal deleted inserted replaced
0:dfcd8b6c1bda 1:bec36315bd12
1 /*****************************************************************************
2 nucBed.cpp
3
4 (c) 2009 - Aaron Quinlan
5 Hall Laboratory
6 Department of Biochemistry and Molecular Genetics
7 University of Virginia
8 aaronquinlan@gmail.com
9
10 Licenced under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #include "lineFileUtilities.h"
13 #include "nucBed.h"
14
15
16 NucBed::NucBed(string &dbFile, string &bedFile, bool printSeq,
17 bool hasPattern, const string &pattern, bool forceStrand) {
18
19 _dbFile = dbFile;
20 _bedFile = bedFile;
21 _printSeq = printSeq;
22 _hasPattern = hasPattern;
23 _pattern = pattern;
24 _forceStrand = forceStrand;
25
26 _bed = new BedFile(_bedFile);
27
28 // Compute the DNA content in each BED/GFF/VCF interval
29 ProfileDNA();
30 }
31
32
33 NucBed::~NucBed(void)
34 {}
35
36
37 void NucBed::ReportDnaProfile(const BED& bed, const string &sequence, int seqLength)
38 {
39 int a,c,g,t,n,other,userPatternCount;
40 a = c = g = t = n = other = userPatternCount = 0;
41
42 getDnaContent(sequence,a,c,g,t,n,other);
43
44 if (_hasPattern)
45 userPatternCount = countPattern(sequence, _pattern);
46
47
48 // report the original interval
49 _bed->reportBedTab(bed);
50 // report AT and GC content
51 printf("%f\t%f\t",(float)(a+t)/seqLength, (float)(c+g)/seqLength);
52 // report raw nucleotide counts
53 printf("%d\t%d\t%d\t%d\t%d\t%d\t%d",a,c,g,t,n,other,seqLength);
54 // add the original sequence if requested.
55
56 if (_printSeq)
57 printf("\t%s",sequence.c_str());
58 if (_hasPattern)
59 printf("\t%d",userPatternCount);
60 printf("\n");
61
62 }
63
64
65 void NucBed::PrintHeader(void) {
66 printf("#");
67
68 int numOrigColumns = (int) _bed->bedType;
69 for (int i = 1; i <= numOrigColumns; ++i) {
70 printf("%d_usercol\t", i);
71 }
72 printf("%d_pct_at\t", numOrigColumns + 1);
73 printf("%d_pct_gc\t", numOrigColumns + 2);
74 printf("%d_num_A\t", numOrigColumns + 3);
75 printf("%d_num_C\t", numOrigColumns + 4);
76 printf("%d_num_G\t", numOrigColumns + 5);
77 printf("%d_num_T\t", numOrigColumns + 6);
78 printf("%d_num_N\t", numOrigColumns + 7);
79 printf("%d_num_oth\t", numOrigColumns + 8);
80 printf("%d_seq_len\t", numOrigColumns + 9);
81
82 if (_printSeq)
83 printf("%d_seq", numOrigColumns + 10);
84 if (_hasPattern && !_printSeq)
85 printf("%d_user_patt_count", numOrigColumns + 10);
86 else if (_hasPattern && _printSeq)
87 printf("\t%d_user_patt_count", numOrigColumns + 11);
88 printf("\n");
89
90 }
91
92
93 //******************************************************************************
94 // ExtractDNA
95 //******************************************************************************
96 void NucBed::ProfileDNA() {
97
98 /* Make sure that we can oen all of the files successfully*/
99
100 // open the fasta database for reading
101 ifstream faDb(_dbFile.c_str(), ios::in);
102 if ( !faDb ) {
103 cerr << "Error: The requested fasta database file (" << _dbFile << ") could not be opened. Exiting!" << endl;
104 exit (1);
105 }
106
107 // open and memory-map genome file
108 FastaReference fr;
109 bool memmap = true;
110 fr.open(_dbFile, memmap);
111
112 bool headerReported = false;
113 BED bed, nullBed;
114 int lineNum = 0;
115 BedLineStatus bedStatus;
116 string sequence;
117
118 _bed->Open();
119 while ((bedStatus = _bed->GetNextBed(bed, lineNum)) != BED_INVALID) {
120 if (bedStatus == BED_VALID) {
121 if (headerReported == false) {
122 PrintHeader();
123 headerReported = true;
124 }
125 // make sure we are extracting >= 1 bp
126 if (bed.zeroLength == false) {
127 size_t seqLength = fr.sequenceLength(bed.chrom);
128 // make sure this feature will not exceed the end of the chromosome.
129 if ( (bed.start <= seqLength) && (bed.end <= seqLength) )
130 {
131 // grab the dna at this interval
132 int length = bed.end - bed.start;
133 // report the sequence's content
134 string dna = fr.getSubSequence(bed.chrom, bed.start, length);
135 // rev comp si necessaire
136 if ((_forceStrand == true) && (bed.strand == "-"))
137 reverseComplement(dna);
138 ReportDnaProfile(bed, dna, length);
139 bed = nullBed;
140 }
141 else
142 {
143 cerr << "Feature (" << bed.chrom << ":" << bed.start << "-" << bed.end << ") beyond the length of "
144 << bed.chrom << " size (" << seqLength << " bp). Skipping." << endl;
145 }
146 }
147 // handle zeroLength
148 else {
149 cerr << "Feature (" << bed.chrom << ":" << bed.start+1 << "-" << bed.end-1 << ") has length = 0, Skipping." << endl;
150 }
151 bed = nullBed;
152 }
153 }
154 _bed->Close();
155 }
156
157
158