view BEDTools-Version-2.14.3/src/fastaFromBed/fastaFromBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
line wrap: on
line source

/*****************************************************************************
  fastaFromBed.cpp

  (c) 2009 - Aaron Quinlan
  Hall Laboratory
  Department of Biochemistry and Molecular Genetics
  University of Virginia
  aaronquinlan@gmail.com

  Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "lineFileUtilities.h"
#include "fastaFromBed.h"


Bed2Fa::Bed2Fa(bool useName, const string &dbFile, const string &bedFile,
    const string &fastaOutFile, bool useFasta, bool useStrand) {

    _useName      = useName;
    _dbFile       = dbFile;
    _bedFile      = bedFile;
    _fastaOutFile = fastaOutFile;
    _useFasta     = useFasta;
    _useStrand    = useStrand;

    _bed = new BedFile(_bedFile);

    // Figure out what the output file should be.
    if (fastaOutFile == "stdout") {
        _faOut = &cout;
    }
    else {
        // Make sure we can open the file.
        ofstream fa(fastaOutFile.c_str(), ios::out);
        if ( !fa ) {
            cerr << "Error: The requested fasta output file (" << fastaOutFile << ") could not be opened. Exiting!" << endl;
            exit (1);
        }
        else {
            fa.close();
            _faOut = new ofstream(fastaOutFile.c_str(), ios::out);
        }
    }

    // Extract the requested intervals from the FASTA input file.
    ExtractDNA();
}


Bed2Fa::~Bed2Fa(void) {
}


//******************************************************************************
// ReportDNA
//******************************************************************************
void Bed2Fa::ReportDNA(const BED &bed, string &dna) {

    // revcomp if necessary.  Thanks to Thomas Doktor.
    if ((_useStrand == true) && (bed.strand == "-"))
        reverseComplement(dna);

    if (!(_useName)) {
        if (_useFasta == true) {
            if (_useStrand == true)
                *_faOut << ">" << bed.chrom << ":" << bed.start << "-" << bed.end   << "(" << bed.strand << ")" << endl << dna << endl;
            else
                *_faOut << ">" << bed.chrom << ":" << bed.start << "-" << bed.end << endl << dna << endl;
        }
        else {
            if (_useStrand == true)
                *_faOut << bed.chrom << ":" << bed.start << "-" << bed.end << "(" << bed.strand << ")" << "\t" << dna << endl;
            else
                *_faOut << bed.chrom << ":" << bed.start << "-" << bed.end << "\t" << dna << endl;
        }
    }
    else {
        if (_useFasta == true)
            *_faOut << ">" << bed.name << endl << dna << endl;
        else
            *_faOut << bed.name << "\t" << dna << endl;
    }
}



//******************************************************************************
// ExtractDNA
//******************************************************************************
void Bed2Fa::ExtractDNA() {

    /* Make sure that we can oen all of the files successfully*/

    // open the fasta database for reading
    ifstream faDb(_dbFile.c_str(), ios::in);
    if ( !faDb ) {
        cerr << "Error: The requested fasta database file (" << _dbFile << ") could not be opened. Exiting!" << endl;
        exit (1);
    }

    // open and memory-map genome file
    FastaReference *fr = new FastaReference;
    bool memmap = true;
    fr->open(_dbFile, memmap);

    BED bed, nullBed;
    int lineNum = 0;
    BedLineStatus bedStatus;
    string sequence;

    _bed->Open();
    while ((bedStatus = _bed->GetNextBed(bed, lineNum)) != BED_INVALID) {
        if (bedStatus == BED_VALID) {
            // make sure we are extracting >= 1 bp
            if (bed.zeroLength == false) {
                size_t seqLength = fr->sequenceLength(bed.chrom);
                // make sure this feature will not exceed the end of the chromosome.
                if ( (bed.start <= seqLength) && (bed.end <= seqLength) ) 
                {
                    int length = bed.end - bed.start;
                    sequence = fr->getSubSequence(bed.chrom, bed.start, length);
                    ReportDNA(bed, sequence);
                }
                else
                {
                    cerr << "Feature (" << bed.chrom << ":" << bed.start << "-" << bed.end << ") beyond the length of "
                        << bed.chrom << " size (" << seqLength << " bp).  Skipping." << endl;
                }
            }
            // handle zeroLength 
            else {
                cerr << "Feature (" << bed.chrom << ":" << bed.start+1 << "-" << bed.end-1 << ") has length = 0, Skipping." << endl;
            }
            bed = nullBed;
        }
    }
    _bed->Close();
}