Mercurial repository jdv/b2b_sync_reads: view of sync_reads @ 1:b7f66945bf72 (draft, default, tip)
"planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9bf8a0462bd44f170c0371b6cae67dd0c3b3da9f-dirty"
author   | jdv
date     | Tue, 28 Sep 2021 06:14:59 +0000
parents  |
children |
#!/usr/bin/env perl

use strict;
use warnings;
use 5.012;
use autodie qw/open close/;

use BioX::Seq::Stream;
use Getopt::Long;
use Pod::Usage;

my $reads1;
my $reads2;
my $singles;
my $sync_suffix    = 'sync';
my $singles_suffix = 'singles';
my $compress = ''; # one of 'gzip' or 'dsrc'
my $outname1;
my $outname2;
my $singles1_name;
my $singles2_name;

use constant DSRC_BIN => 'dsrc';
use constant GZIP_BIN => 'gzip';
use constant NAME     => 'sync_reads';
use constant VERSION  => '0.005';

GetOptions(
    'fwd=s'             => \$reads1,
    'rev=s'             => \$reads2,
    'singles'           => \$singles,
    'fwd_out=s'         => \$outname1,
    'rev_out=s'         => \$outname2,
    'fwd_singles_out=s' => \$singles1_name,
    'rev_singles_out=s' => \$singles2_name,
    'sync_suffix=s'     => \$sync_suffix,
    'singles_suffix=s'  => \$singles_suffix,
    'compress:s'        => \$compress,
    'help'              => sub{ pod2usage(-verbose => 2); },
    'version'           => sub{ print 'This is ', NAME, ' v', VERSION, "\n"; exit; },
) or pod2usage( -verbose => 1 );

pod2usage(-verbose => 1, -msg => "Error: input files can't be read")
    if (! -r $reads1 || ! -r $reads2);

# open output filehandles, possibly with compression

if (! defined $outname1) {
    $outname1 = $reads1;
    $outname1 =~ s/([^\.]+)$/$sync_suffix\.$1/;
}
if (! defined $outname2) {
    $outname2 = $reads2;
    $outname2 =~ s/([^\.]+)$/$sync_suffix\.$1/;
}

my $mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>';
my $cmd  = $compress eq 'gzip' ? GZIP_BIN . ' -c > '
         : $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s '
         : '';
my $suffix = $compress eq 'gzip' ? '.gz'
           : $compress eq 'dsrc' ? '.dsrc'
           : '';

open my $s1, $mode, $cmd . $outname1 . $suffix;
open my $s2, $mode, $cmd . $outname2 . $suffix;

# open singles output filehandles if requested, possibly with compression

my $up1;
my $up2;
if ($singles) {

    if (! defined $singles1_name) {
        $singles1_name = $reads1;
        $singles1_name =~ s/([^\.]+)$/$singles_suffix\.$1/;
    }
    if (! defined $singles2_name) {
        $singles2_name = $reads2;
        $singles2_name =~ s/([^\.]+)$/$singles_suffix\.$1/;
    }

    $mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>';
    $cmd  = $compress eq 'gzip' ? GZIP_BIN . ' -c > '
          : $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s '
          : '';
    my $suffix = $compress eq 'gzip' ? '.gz'
               : $compress eq 'dsrc' ? '.dsrc'
               : '';

    open $up1, $mode, $cmd . $singles1_name . $suffix;
    open $up2, $mode, $cmd . $singles2_name . $suffix;

}
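
# Read cache layout (describes the loop below): $ll_f1 and $ll_f2 map a read
# name to a hash holding the formatted FASTQ record ('read') and the name of
# the read cached immediately before it from the same file ('prev'), forming
# a simple linked list in input order. When a read from one file matches an
# entry cached from the other file, every entry cached before the match is
# known to be unpaired and is flushed by purge_cache() (written to the
# singles output if --singles was given, otherwise discarded). This keeps
# memory usage proportional to the distance between pairs rather than to the
# file size.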
my $ll_f1 = {};
my $ll_f2 = {};
my $f1_prev;
my $f2_prev;
my $f1_open = 1;
my $f2_open = 1;

my $parser1 = BioX::Seq::Stream->new($reads1);
my $parser2 = BioX::Seq::Stream->new($reads2);

while ($f1_open || $f2_open) {

    # process next read for file 1
    if ($f1_open && defined (my $seq = $parser1->next_seq)) {
        my $name = $seq->id;
        $name =~ s/\/[12]$//; # remove directional tag if present
        if (defined $ll_f2->{$name}) {
            my $prev = $ll_f2->{$name}->{prev} // undef;
            purge_cache( $ll_f2, $prev,    $up2 );
            purge_cache( $ll_f1, $f1_prev, $up1 );
            print {$s1} $seq->as_fastq;
            print {$s2} $ll_f2->{$name}->{read};
            delete $ll_f2->{$name};
            $f1_prev = undef;
        }
        else {
            $ll_f1->{$name}->{read} = $seq->as_fastq;
            $ll_f1->{$name}->{prev} = $f1_prev // undef;
            $f1_prev = $name;
        }
    }
    else {
        $f1_open = 0;
    }

    # process next read for file 2
    if ($f2_open && defined (my $seq = $parser2->next_seq)) {
        my $name = $seq->id;
        $name =~ s/\/[12]$//; # remove directional tag if present
        if (defined $ll_f1->{$name}) {
            my $prev = $ll_f1->{$name}->{prev} // undef;
            purge_cache( $ll_f1, $prev,    $up1 );
            purge_cache( $ll_f2, $f2_prev, $up2 );
            print {$s2} $seq->as_fastq;
            print {$s1} $ll_f1->{$name}->{read};
            delete $ll_f1->{$name};
            $f2_prev = undef;
        }
        else {
            $ll_f2->{$name}->{read} = $seq->as_fastq;
            $ll_f2->{$name}->{prev} = $f2_prev // undef;
            $f2_prev = $name;
        }
    }
    else {
        $f2_open = 0;
    }

}

# handle remaining unpaired reads if necessary
if ($singles) {
    purge_cache( $ll_f1, $f1_prev, $up1 );
    purge_cache( $ll_f2, $f2_prev, $up2 );
    close $up1;
    close $up2;
}

exit;

sub purge_cache {

    my ($ll, $prev, $fh) = @_;

    while (defined $prev && defined $ll->{$prev}) {
        my $prev2 = $ll->{$prev}->{prev} // undef;
        print {$fh} $ll->{$prev}->{read} if ($singles);
        delete $ll->{$prev};
        $prev = $prev2;
    }

}

__END__

=head1 NAME

sync_reads - resynchronize paired FASTQ files

=head1 SYNOPSIS

sync_reads [options] --fwd I<left_reads> --rev I<right_reads>

=head1 DESCRIPTION

B<sync_reads> re-synchronizes two FASTQ files containing paired reads which
are no longer in sync due to individual removal of reads during
pre-processing (trimming, filtering, etc.). In this case, "in sync" means
that both files have the same number of reads and, at any given read position
in the files, the corresponding reads represent proper pairs. The resulting
files will contain matching reads in order (assuming the input files were
properly ordered). Unpaired reads can optionally be written to separate
files. Memory usage does not depend on input file size but rather on the
maximum distance between paired reads in the two files, as the read cache is
flushed each time a pair is identified. In the worst-case scenario (one file
has a single read that pairs with the last read in the matching file) memory
usage can approach the largest file size, but in typical usage it rarely
exceeds a few MB regardless of file size.

B<IMPORTANT:> Reads in the input files MUST be in the same order, aside from
missing reads, or the output will report many valid pairs as singletons.

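=head1 EXAMPLES

A typical invocation, keeping singleton reads and compressing all output
with gzip (the file names below are purely illustrative):

    sync_reads --fwd sample_R1.trimmed.fq --rev sample_R2.trimmed.fq \
        --singles --compress gzip

With the default suffixes this writes sample_R1.trimmed.sync.fq.gz and
sample_R2.trimmed.sync.fq.gz for the synchronized pairs, plus
sample_R1.trimmed.singles.fq.gz and sample_R2.trimmed.singles.fq.gz for the
unpaired reads.
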
=head1 OPTIONS

=head2 Mandatory

=over 4

=item B<--fwd> I<forward_fastq>

Specify FASTQ file containing the first of the trimmed read pairs

=item B<--rev> I<reverse_fastq>

Specify FASTQ file containing the second of the trimmed read pairs

=back

=head2 Optional

=over 4

=item B<--fwd_out> I<filename>

Specify output name for synced forward reads

=item B<--rev_out> I<filename>

Specify output name for synced reverse reads

=item B<--fwd_singles_out> I<filename>

Specify output name for forward singleton reads

=item B<--rev_singles_out> I<filename>

Specify output name for reverse singleton reads

=item B<--sync_suffix> I<suffix>

Specify suffix to add to synced read output files. This will be added to the
input read name before the final suffix (i.e. after the last period). Default
is 'sync'.

=item B<--compress> I<gzip|dsrc>

Specify type of compression for output files (will compress all output files)

=item B<--singles>

If given, unpaired reads will be written to separate output files. Default is
FALSE.

=item B<--singles_suffix> I<suffix>

Specify suffix to add to singles read output files. This will be added to the
input read name before the final suffix (i.e. after the last period). Default
is 'singles'.

=item B<--help>

Display this usage page

=item B<--version>

Print version information

=back

=head1 CAVEATS AND BUGS

Currently no input validation is performed on the input files. Files are
assumed to be standard FASTQ file format with each read represented by four
lines and no other extraneous information present. B<CRITICALLY>, they are
also assumed to be in the same input order after accounting for deleted reads
(the software will fail miserably if this is not the case).

Please submit bug reports to the issue tracker in the distribution
repository.

=head1 AUTHOR

Jeremy Volkening <jdv@base2bio.com>

=head1 COPYRIGHT AND LICENSE

Copyright 2014-16 Jeremy Volkening

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.

=cut