Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/nucBed/nucBed.cpp @ 1:bec36315bd12 default tip
Deleted selected files
author | aaronquinlan |
---|---|
date | Sat, 19 Nov 2011 14:17:03 -0500 |
parents | dfcd8b6c1bda |
children |
comparison
equal
deleted
inserted
replaced
0:dfcd8b6c1bda | 1:bec36315bd12 |
---|---|
1 /***************************************************************************** | |
2 nucBed.cpp | |
3 | |
4 (c) 2009 - Aaron Quinlan | |
5 Hall Laboratory | |
6 Department of Biochemistry and Molecular Genetics | |
7 University of Virginia | |
8 aaronquinlan@gmail.com | |
9 | |
10 Licenced under the GNU General Public License 2.0 license. | |
11 ******************************************************************************/ | |
12 #include "lineFileUtilities.h" | |
13 #include "nucBed.h" | |
14 | |
15 | |
16 NucBed::NucBed(string &dbFile, string &bedFile, bool printSeq, | |
17 bool hasPattern, const string &pattern, bool forceStrand) { | |
18 | |
19 _dbFile = dbFile; | |
20 _bedFile = bedFile; | |
21 _printSeq = printSeq; | |
22 _hasPattern = hasPattern; | |
23 _pattern = pattern; | |
24 _forceStrand = forceStrand; | |
25 | |
26 _bed = new BedFile(_bedFile); | |
27 | |
28 // Compute the DNA content in each BED/GFF/VCF interval | |
29 ProfileDNA(); | |
30 } | |
31 | |
32 | |
33 NucBed::~NucBed(void) | |
34 {} | |
35 | |
36 | |
37 void NucBed::ReportDnaProfile(const BED& bed, const string &sequence, int seqLength) | |
38 { | |
39 int a,c,g,t,n,other,userPatternCount; | |
40 a = c = g = t = n = other = userPatternCount = 0; | |
41 | |
42 getDnaContent(sequence,a,c,g,t,n,other); | |
43 | |
44 if (_hasPattern) | |
45 userPatternCount = countPattern(sequence, _pattern); | |
46 | |
47 | |
48 // report the original interval | |
49 _bed->reportBedTab(bed); | |
50 // report AT and GC content | |
51 printf("%f\t%f\t",(float)(a+t)/seqLength, (float)(c+g)/seqLength); | |
52 // report raw nucleotide counts | |
53 printf("%d\t%d\t%d\t%d\t%d\t%d\t%d",a,c,g,t,n,other,seqLength); | |
54 // add the original sequence if requested. | |
55 | |
56 if (_printSeq) | |
57 printf("\t%s",sequence.c_str()); | |
58 if (_hasPattern) | |
59 printf("\t%d",userPatternCount); | |
60 printf("\n"); | |
61 | |
62 } | |
63 | |
64 | |
65 void NucBed::PrintHeader(void) { | |
66 printf("#"); | |
67 | |
68 int numOrigColumns = (int) _bed->bedType; | |
69 for (int i = 1; i <= numOrigColumns; ++i) { | |
70 printf("%d_usercol\t", i); | |
71 } | |
72 printf("%d_pct_at\t", numOrigColumns + 1); | |
73 printf("%d_pct_gc\t", numOrigColumns + 2); | |
74 printf("%d_num_A\t", numOrigColumns + 3); | |
75 printf("%d_num_C\t", numOrigColumns + 4); | |
76 printf("%d_num_G\t", numOrigColumns + 5); | |
77 printf("%d_num_T\t", numOrigColumns + 6); | |
78 printf("%d_num_N\t", numOrigColumns + 7); | |
79 printf("%d_num_oth\t", numOrigColumns + 8); | |
80 printf("%d_seq_len\t", numOrigColumns + 9); | |
81 | |
82 if (_printSeq) | |
83 printf("%d_seq", numOrigColumns + 10); | |
84 if (_hasPattern && !_printSeq) | |
85 printf("%d_user_patt_count", numOrigColumns + 10); | |
86 else if (_hasPattern && _printSeq) | |
87 printf("\t%d_user_patt_count", numOrigColumns + 11); | |
88 printf("\n"); | |
89 | |
90 } | |
91 | |
92 | |
93 //****************************************************************************** | |
94 // ExtractDNA | |
95 //****************************************************************************** | |
96 void NucBed::ProfileDNA() { | |
97 | |
98 /* Make sure that we can oen all of the files successfully*/ | |
99 | |
100 // open the fasta database for reading | |
101 ifstream faDb(_dbFile.c_str(), ios::in); | |
102 if ( !faDb ) { | |
103 cerr << "Error: The requested fasta database file (" << _dbFile << ") could not be opened. Exiting!" << endl; | |
104 exit (1); | |
105 } | |
106 | |
107 // open and memory-map genome file | |
108 FastaReference fr; | |
109 bool memmap = true; | |
110 fr.open(_dbFile, memmap); | |
111 | |
112 bool headerReported = false; | |
113 BED bed, nullBed; | |
114 int lineNum = 0; | |
115 BedLineStatus bedStatus; | |
116 string sequence; | |
117 | |
118 _bed->Open(); | |
119 while ((bedStatus = _bed->GetNextBed(bed, lineNum)) != BED_INVALID) { | |
120 if (bedStatus == BED_VALID) { | |
121 if (headerReported == false) { | |
122 PrintHeader(); | |
123 headerReported = true; | |
124 } | |
125 // make sure we are extracting >= 1 bp | |
126 if (bed.zeroLength == false) { | |
127 size_t seqLength = fr.sequenceLength(bed.chrom); | |
128 // make sure this feature will not exceed the end of the chromosome. | |
129 if ( (bed.start <= seqLength) && (bed.end <= seqLength) ) | |
130 { | |
131 // grab the dna at this interval | |
132 int length = bed.end - bed.start; | |
133 // report the sequence's content | |
134 string dna = fr.getSubSequence(bed.chrom, bed.start, length); | |
135 // rev comp si necessaire | |
136 if ((_forceStrand == true) && (bed.strand == "-")) | |
137 reverseComplement(dna); | |
138 ReportDnaProfile(bed, dna, length); | |
139 bed = nullBed; | |
140 } | |
141 else | |
142 { | |
143 cerr << "Feature (" << bed.chrom << ":" << bed.start << "-" << bed.end << ") beyond the length of " | |
144 << bed.chrom << " size (" << seqLength << " bp). Skipping." << endl; | |
145 } | |
146 } | |
147 // handle zeroLength | |
148 else { | |
149 cerr << "Feature (" << bed.chrom << ":" << bed.start+1 << "-" << bed.end-1 << ") has length = 0, Skipping." << endl; | |
150 } | |
151 bed = nullBed; | |
152 } | |
153 } | |
154 _bed->Close(); | |
155 } | |
156 | |
157 | |
158 |