Mercurial > repos > jdv > b2b_summarize_assembly
diff sync_reads @ 1:5def63878840 draft default tip
"planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9bf8a0462bd44f170c0371b6cae67dd0c3b3da9f-dirty"
author | jdv |
---|---|
date | Tue, 28 Sep 2021 06:16:07 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sync_reads Tue Sep 28 06:16:07 2021 +0000 @@ -0,0 +1,300 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use 5.012; + +use autodie qw/open close/; +use BioX::Seq::Stream; +use Getopt::Long; +use Pod::Usage; + +my $reads1; +my $reads2; +my $singles; +my $sync_suffix = 'sync'; +my $singles_suffix = 'singles'; +my $compress = ''; # one of 'gzip' or 'dsrc' +my $outname1; +my $outname2; +my $singles1_name; +my $singles2_name; + +use constant DSRC_BIN => 'dsrc'; +use constant GZIP_BIN => 'gzip'; +use constant NAME => 'sync_reads'; +use constant VERSION => '0.005'; + +GetOptions( + + 'fwd=s' => \$reads1, + 'rev=s' => \$reads2, + 'singles' => \$singles, + 'fwd_out=s' => \$outname1, + 'rev_out=s' => \$outname2, + 'fwd_singles_out=s' => \$singles1_name, + 'rev_singles_out=s' => \$singles2_name, + 'sync_suffix=s' => \$sync_suffix, + 'singles_suffix=s' => \$singles_suffix, + 'compress:s' => \$compress, + 'help' => sub{ pod2usage(-verbose => 2); }, + 'version' => sub{ print 'This is ', NAME, ' v', VERSION, "\n";exit; }, + +) or pod2usage( -verbose => 1 ); + +pod2usage(-verbose => 1, -msg => "Error: input files can\'t be read") + if (! -r $reads1 || ! -r $reads2); + +# open output filehandles, possibly with compression +if (! defined $outname1) { + $outname1 = $reads1; + $outname1 =~ s/([^\.]+)$/$sync_suffix\.$1/; +} +if (! defined $outname2) { + $outname2 = $reads2; + $outname2 =~ s/([^\.]+)$/$sync_suffix\.$1/; +} +my $mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>'; +my $cmd = $compress eq 'gzip' ? GZIP_BIN . ' -c > ' + : $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s ' + : ''; +my $suffix = $compress eq 'gzip' ? '.gz' + : $compress eq 'dsrc' ? '.dsrc' + : ''; +open my $s1, $mode, $cmd . $outname1 . $suffix; +open my $s2, $mode, $cmd . $outname2 . $suffix; + + +# open singles output filehandles if requested, possibly with compression +my $up1; +my $up2; +if ($singles) { + if (! defined $singles1_name) { + $singles1_name = $reads1; + $singles1_name =~ s/([^\.]+)$/$singles_suffix\.$1/; + } + if (! defined $singles2_name) { + $singles2_name = $reads2; + $singles2_name =~ s/([^\.]+)$/$singles_suffix\.$1/; + } + $mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>'; + $cmd = $compress eq 'gzip' ? GZIP_BIN . ' -c > ' + : $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s ' + : ''; + my $suffix = $compress eq 'gzip' ? '.gz' + : $compress eq 'dsrc' ? '.dsrc' + : ''; + open $up1, $mode, $cmd . $singles1_name . $suffix; + open $up2, $mode, $cmd . $singles2_name . $suffix; +} + +my $ll_f1 = {}; +my $ll_f2 = {}; +my $f1_prev; +my $f2_prev; +my $f1_open = 1; +my $f2_open = 1; + +my $parser1 = BioX::Seq::Stream->new($reads1); +my $parser2 = BioX::Seq::Stream->new($reads2); + +while ($f1_open || $f2_open) { + + # process next read for file 1 + if ($f1_open && defined (my $seq = $parser1->next_seq)) { + my $name = $seq->id; + $name =~ s/\/[12]$//; #remove directional tag if present + if (defined $ll_f2->{$name}) { + my $prev = $ll_f2->{$name}->{prev} // undef; + purge_cache( $ll_f2, $prev, $up2 ); + purge_cache( $ll_f1, $f1_prev, $up1 ); + print {$s1} $seq->as_fastq; + print {$s2} $ll_f2->{$name}->{read}; + delete $ll_f2->{$name}; + $f1_prev = undef; + } + else { + $ll_f1->{$name}->{read} = $seq->as_fastq; + $ll_f1->{$name}->{prev} = $f1_prev // undef; + $f1_prev = $name; + } + } + else { + $f1_open = 0; + } + + # process next read for file 2 + if ($f2_open && defined (my $seq = $parser2->next_seq)) { + my $name = $seq->id; + $name =~ s/\/[12]$//; #remove directional tag if present + if (defined $ll_f1->{$name}) { + my $prev = $ll_f1->{$name}->{prev} // undef; + purge_cache( $ll_f1, $prev, $up1 ); + purge_cache( $ll_f2, $f2_prev, $up2 ); + print {$s2} $seq->as_fastq; + print {$s1} $ll_f1->{$name}->{read}; + delete $ll_f1->{$name}; + $f2_prev = undef; + } + else { + $ll_f2->{$name}->{read} = $seq->as_fastq; + $ll_f2->{$name}->{prev} = $f2_prev // undef; + $f2_prev = $name; + } + } + else { + $f2_open = 0; + } +} + +#handle remaining unpaired reads if necessary +if ($singles) { + purge_cache( $ll_f1, $f1_prev, $up1 ); + purge_cache( $ll_f2, $f2_prev, $up2 ); + close $up1; + close $up2; +} + +exit; + +sub purge_cache { + + my ($ll, $prev, $fh) = @_; + while (defined $prev && defined $ll->{$prev}) { + my $prev2 = $ll->{$prev}->{prev} // undef; + print {$fh} $ll->{$prev}->{read} if ($singles); + delete $ll->{$prev}; + $prev = $prev2; + } + +} + + +__END__ + +=head1 NAME + +sync_reads - resynchronize paired FASTQ files + +=head1 SYNOPSIS + +sync_reads [options] --fwd I<left_reads> --rev I<right reads> + +=head1 DESCRIPTION + +B<sync_reads> will re-synchronize two FASTQ files containing paired reads which +are no longer in sync due to individual removal of reads during pre-processing +(trimming, filtering, etc). In this case, "in sync" means that both files have +the same number of reads and, at any given read position in the files, the +corresponding reads represent proper pairs. The resulting files will contain +matching reads in order (assuming the input files were properly ordered). It +will optionally print out unpaired reads to separate files. Memory usage is +not dependent on the input file size but rather the maximum distance between +paired reads in the two files, as the read cache is flushed each time paired +reads are identified. In the worst-case scenario (one file has a single read +that pairs with the last read in the matching file) memory usage can approach +the largest file size, but in typical usage it rarely exceeds a few MB +regardless of file size. + +B<IMPORTANT:> Reads in input files MUST be in the same order, aside from +missing reads, or the output will report many valid pairs as singletons. + +=head1 OPTIONS + +=head2 Mandatory + +=over 4 + +=item B<--fwd> I<forward_fastq> + +Specify FASTQ file containing the first of the trimmed read pairs + +=item B<--rev> I<reverse_fastq> + +Specify FASTQ file containing the second of the trimmed read pairs + +=back + +=head2 Optional + +=over 4 + +=item B<--fwd_out> I<filename> + +Specify output name for synced forward reads + +=item B<--rev_out> I<filename> + +Specify output name for synced reverse reads + +=item B<--fwd_singles_out> I<filename> + +Specify output name for forward singleton reads + +=item B<--rev_singles_out> I<filename> + +Specify output name for reverse singleton reads + +=item B<--sync_suffix> I<suffix> + +Specify suffix to add to synced read output files. This will be added to the +input read name before the final suffix (i.e. after the last period). Default +is 'sync'. + +=item B<--compress> I<gzip|dsrc> + +Specify type of compression for output files (will compress all output files) + +=item B<--singles> + +If given, unpaired reads will be written to separate output files. Default is +FALSE. + +=item B<--singles_suffix> I<suffix> + +Specify suffix to add to singles read output files. This will be added to the +input read name before the final suffix (i.e. after the last period). Default +is 'singles'. + +=item B<--help> + +Display this usage page + +=item B<--version> + +Print version information + +=back + +=head1 CAVEATS AND BUGS + +Currently no input validation is performed on the input files. Files are +assumed to be standard FASTQ file format with each read represented by four +lines and no other extraneous information present. B<CRITICALLY>, they are also +assumed to be in the same input order after accounting for deleted reads +(the software will fail miserably if this is not the case). + +Please submit bug reports to the issue tracker in the distribution repository. + +=head1 AUTHOR + +Jeremy Volkening <jdv@base2bio.com> + +=head1 COPYRIGHT AND LICENSE + +Copyright 2014-16 Jeremy Volkening + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. + +=cut