Mercurial repository jdv/b2b_sync_reads: view of sync_reads @ 1:b7f66945bf72 (draft, default, tip)
"planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9bf8a0462bd44f170c0371b6cae67dd0c3b3da9f-dirty"
author   | jdv
date     | Tue, 28 Sep 2021 06:14:59 +0000
parents  |
children |
#!/usr/bin/env perl

use strict;
use warnings;
use 5.012;
use autodie qw/open close/;

use BioX::Seq::Stream;
use Getopt::Long;
use Pod::Usage;

my $reads1;
my $reads2;
my $singles;
my $sync_suffix    = 'sync';
my $singles_suffix = 'singles';
my $compress = ''; # one of 'gzip' or 'dsrc'
my $outname1;
my $outname2;
my $singles1_name;
my $singles2_name;

use constant DSRC_BIN => 'dsrc';
use constant GZIP_BIN => 'gzip';
use constant NAME     => 'sync_reads';
use constant VERSION  => '0.005';

GetOptions(
    'fwd=s'             => \$reads1,
    'rev=s'             => \$reads2,
    'singles'           => \$singles,
    'fwd_out=s'         => \$outname1,
    'rev_out=s'         => \$outname2,
    'fwd_singles_out=s' => \$singles1_name,
    'rev_singles_out=s' => \$singles2_name,
    'sync_suffix=s'     => \$sync_suffix,
    'singles_suffix=s'  => \$singles_suffix,
    'compress:s'        => \$compress,
    'help'              => sub{ pod2usage(-verbose => 2); },
    'version'           => sub{ print 'This is ', NAME, ' v', VERSION, "\n"; exit; },
) or pod2usage( -verbose => 1 );

pod2usage(-verbose => 1, -msg => "Error: input files can't be read")
    if (! -r $reads1 || ! -r $reads2);

# open output filehandles, possibly with compression

if (! defined $outname1) {
    $outname1 = $reads1;
    $outname1 =~ s/([^\.]+)$/$sync_suffix\.$1/;
}
if (! defined $outname2) {
    $outname2 = $reads2;
    $outname2 =~ s/([^\.]+)$/$sync_suffix\.$1/;
}

my $mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>';
my $cmd  = $compress eq 'gzip' ? GZIP_BIN . ' -c > '
         : $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s '
         : '';
my $suffix = $compress eq 'gzip' ? '.gz'
           : $compress eq 'dsrc' ? '.dsrc'
           : '';

open my $s1, $mode, $cmd . $outname1 . $suffix;
open my $s2, $mode, $cmd . $outname2 . $suffix;

# open singles output filehandles if requested, possibly with compression

my $up1;
my $up2;
if ($singles) {

    if (! defined $singles1_name) {
        $singles1_name = $reads1;
        $singles1_name =~ s/([^\.]+)$/$singles_suffix\.$1/;
    }
    if (! defined $singles2_name) {
        $singles2_name = $reads2;
        $singles2_name =~ s/([^\.]+)$/$singles_suffix\.$1/;
    }

    $mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>';
    $cmd  = $compress eq 'gzip' ? GZIP_BIN . ' -c > '
          : $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s '
          : '';
    my $suffix = $compress eq 'gzip' ? '.gz'
               : $compress eq 'dsrc' ? '.dsrc'
               : '';

    open $up1, $mode, $cmd . $singles1_name . $suffix;
    open $up2, $mode, $cmd . $singles2_name . $suffix;

}
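
# Read cache layout (describes the loop below): $ll_f1 and $ll_f2 map a read
# name to a hash holding the formatted FASTQ record ('read') and the name of
# the read cached immediately before it from the same file ('prev'), forming
# a simple linked list in input order. When a read from one file matches an
# entry cached from the other file, every entry cached before the match is
# known to be unpaired and is flushed by purge_cache() (written to the
# singles output if --singles was given, otherwise discarded). This keeps
# memory usage proportional to the distance between pairs rather than to the
# file size.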
my $ll_f1 = {};
my $ll_f2 = {};
my $f1_prev;
my $f2_prev;
my $f1_open = 1;
my $f2_open = 1;

my $parser1 = BioX::Seq::Stream->new($reads1);
my $parser2 = BioX::Seq::Stream->new($reads2);

while ($f1_open || $f2_open) {

    # process next read for file 1
    if ($f1_open && defined (my $seq = $parser1->next_seq)) {
        my $name = $seq->id;
        $name =~ s/\/[12]$//; # remove directional tag if present
        if (defined $ll_f2->{$name}) {
            my $prev = $ll_f2->{$name}->{prev} // undef;
            purge_cache( $ll_f2, $prev,    $up2 );
            purge_cache( $ll_f1, $f1_prev, $up1 );
            print {$s1} $seq->as_fastq;
            print {$s2} $ll_f2->{$name}->{read};
            delete $ll_f2->{$name};
            $f1_prev = undef;
        }
        else {
            $ll_f1->{$name}->{read} = $seq->as_fastq;
            $ll_f1->{$name}->{prev} = $f1_prev // undef;
            $f1_prev = $name;
        }
    }
    else {
        $f1_open = 0;
    }

    # process next read for file 2
    if ($f2_open && defined (my $seq = $parser2->next_seq)) {
        my $name = $seq->id;
        $name =~ s/\/[12]$//; # remove directional tag if present
        if (defined $ll_f1->{$name}) {
            my $prev = $ll_f1->{$name}->{prev} // undef;
            purge_cache( $ll_f1, $prev,    $up1 );
            purge_cache( $ll_f2, $f2_prev, $up2 );
            print {$s2} $seq->as_fastq;
            print {$s1} $ll_f1->{$name}->{read};
            delete $ll_f1->{$name};
            $f2_prev = undef;
        }
        else {
            $ll_f2->{$name}->{read} = $seq->as_fastq;
            $ll_f2->{$name}->{prev} = $f2_prev // undef;
            $f2_prev = $name;
        }
    }
    else {
        $f2_open = 0;
    }

}

# handle remaining unpaired reads if necessary
if ($singles) {
    purge_cache( $ll_f1, $f1_prev, $up1 );
    purge_cache( $ll_f2, $f2_prev, $up2 );
    close $up1;
    close $up2;
}

exit;

sub purge_cache {

    my ($ll, $prev, $fh) = @_;

    while (defined $prev && defined $ll->{$prev}) {
        my $prev2 = $ll->{$prev}->{prev} // undef;
        print {$fh} $ll->{$prev}->{read} if ($singles);
        delete $ll->{$prev};
        $prev = $prev2;
    }

}

__END__

=head1 NAME

sync_reads - resynchronize paired FASTQ files

=head1 SYNOPSIS

sync_reads [options] --fwd I<left_reads> --rev I<right_reads>

=head1 DESCRIPTION

B<sync_reads> re-synchronizes two FASTQ files containing paired reads which
are no longer in sync due to individual removal of reads during
pre-processing (trimming, filtering, etc.). In this case, "in sync" means
that both files have the same number of reads and, at any given read position
in the files, the corresponding reads represent proper pairs. The resulting
files will contain matching reads in order (assuming the input files were
properly ordered). Unpaired reads can optionally be written to separate
files. Memory usage does not depend on input file size but rather on the
maximum distance between paired reads in the two files, as the read cache is
flushed each time a pair is identified. In the worst-case scenario (one file
has a single read that pairs with the last read in the matching file) memory
usage can approach the largest file size, but in typical usage it rarely
exceeds a few MB regardless of file size.

B<IMPORTANT:> Reads in the input files MUST be in the same order, aside from
missing reads, or the output will report many valid pairs as singletons.

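=head1 EXAMPLES

A typical invocation, keeping singleton reads and compressing all output
with gzip (the file names below are purely illustrative):

    sync_reads --fwd sample_R1.trimmed.fq --rev sample_R2.trimmed.fq \
        --singles --compress gzip

With the default suffixes this writes sample_R1.trimmed.sync.fq.gz and
sample_R2.trimmed.sync.fq.gz for the synchronized pairs, plus
sample_R1.trimmed.singles.fq.gz and sample_R2.trimmed.singles.fq.gz for the
unpaired reads.
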
=head1 OPTIONS

=head2 Mandatory

=over 4

=item B<--fwd> I<forward_fastq>

Specify FASTQ file containing the first of the trimmed read pairs

=item B<--rev> I<reverse_fastq>

Specify FASTQ file containing the second of the trimmed read pairs

=back

=head2 Optional

=over 4

=item B<--fwd_out> I<filename>

Specify output name for synced forward reads

=item B<--rev_out> I<filename>

Specify output name for synced reverse reads

=item B<--fwd_singles_out> I<filename>

Specify output name for forward singleton reads

=item B<--rev_singles_out> I<filename>

Specify output name for reverse singleton reads

=item B<--sync_suffix> I<suffix>

Specify suffix to add to synced read output files. This will be added to the
input read name before the final suffix (i.e. after the last period). Default
is 'sync'.

=item B<--compress> I<gzip|dsrc>

Specify type of compression for output files (will compress all output files)

=item B<--singles>

If given, unpaired reads will be written to separate output files. Default is
FALSE.

=item B<--singles_suffix> I<suffix>

Specify suffix to add to singles read output files. This will be added to the
input read name before the final suffix (i.e. after the last period). Default
is 'singles'.

=item B<--help>

Display this usage page

=item B<--version>

Print version information

=back

=head1 CAVEATS AND BUGS

Currently no input validation is performed on the input files. Files are
assumed to be standard FASTQ file format with each read represented by four
lines and no other extraneous information present. B<CRITICALLY>, they are
also assumed to be in the same input order after accounting for deleted reads
(the software will fail miserably if this is not the case).

Please submit bug reports to the issue tracker in the distribution
repository.

=head1 AUTHOR

Jeremy Volkening <jdv@base2bio.com>

=head1 COPYRIGHT AND LICENSE

Copyright 2014-16 Jeremy Volkening

This program is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.

=cut