annotate BEDTools-Version-2.14.3/src/utils/Fasta/Fasta.h @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
1 // ***************************************************************************
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
2 // Fasta.h (c) 2010 Erik Garrison <erik.garrison@bc.edu>
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
3 // Marth Lab, Department of Biology, Boston College
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
4 // All rights reserved.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
5 // ---------------------------------------------------------------------------
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
6 // Last modified: 5 February 2010 (EG)
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
7 // ---------------------------------------------------------------------------
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
8
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
9 #ifndef _FASTA_H
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
10 #define _FASTA_H
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
11
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
12 #include <map>
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
13 #include <iostream>
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
14 #include <fstream>
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
15 #include <vector>
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
16 #include <stdint.h>
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
17 #include <stdio.h>
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
18 #include <algorithm>
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
19 #include "LargeFileSupport.h"
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
20 #include <sys/stat.h>
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
21 #include <sys/mman.h>
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
22 #include "split.h"
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
23 #include <stdlib.h>
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
24 #include <ctype.h>
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
25 #include <unistd.h>
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
26
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
27 using namespace std;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
28
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
// A single record of a FASTA index: the name and length of one sequence plus
// the numbers describing where and how it is laid out in the file.
// Only declarations live here; the member functions are defined outside this
// header.
class FastaIndexEntry {
    // Stream-insertion operator; declared a friend of this class.
    friend ostream& operator<<(ostream& output, const FastaIndexEntry& e);

public:
    // ---- construction / destruction -------------------------------------
    FastaIndexEntry();
    // Parameters carry the same names as the data members below.
    FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len);
    ~FastaIndexEntry();

    // ---- operations -----------------------------------------------------
    // NOTE(review): presumably resets the fields below; the definition is not
    // visible from this header -- confirm there.
    void clear();

    // ---- data -----------------------------------------------------------
    // Declaration order is kept as-is: it fixes object layout and the order
    // in which the members are initialized.
    string name;       // sequence name
    int length;        // length of sequence
    long long offset;  // bytes offset of sequence from start of file
    int line_blen;     // line length in bytes, sequence characters
    int line_len;      // line length including newline
};
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
42
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
43 class FastaIndex : public map<string, FastaIndexEntry> {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
44 friend ostream& operator<<(ostream& output, FastaIndex& i);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
45 public:
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
46 FastaIndex(void);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
47 ~FastaIndex(void);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
48 vector<string> sequenceNames;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
49 void indexReference(string refName);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
50 void readIndexFile(string fname);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
51 void writeIndexFile(string fname);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
52 ifstream indexFile;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
53 FastaIndexEntry entry(string key);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
54 void flushEntryToIndex(FastaIndexEntry& entry);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
55 string indexFileExtension(void);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
56 };
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
57
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
58 class FastaReference {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
59 public:
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
60 void open(string reffilename, bool usemmap = false);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
61 bool usingmmap;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
62 string filename;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
63 FastaReference(void) : usingmmap(false) { }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
64 ~FastaReference(void);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
65 FILE* file;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
66 void* filemm;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
67 size_t filesize;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
68 FastaIndex* index;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
69 vector<FastaIndexEntry> findSequencesStartingWith(string seqnameStart);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
70 string getSequence(string seqname);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
71 // potentially useful for performance, investigate
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
72 // void getSequence(string seqname, string& sequence);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
73 string getSubSequence(string seqname, int start, int length);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
74 string sequenceNameStartingWith(string seqnameStart);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
75 long unsigned int sequenceLength(string seqname);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
76 };
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
77
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
78 #endif