b2b_summarize_assembly: sync

comparison sync_reads @ 1:5def63878840 draft default tip

"planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9bf8a0462bd44f170c0371b6cae67dd0c3b3da9f-dirty"

author	jdv
date	Tue, 28 Sep 2021 06:16:07 +0000
parents
children

comparison

equal deleted inserted replaced

-:114353d77370
+:5def63878840
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use 5.012;
+use autodie qw/open close/;
+use BioX::Seq::Stream;
+use Getopt::Long;
+use Pod::Usage;
+my $reads1;
+my $reads2;
+my $singles;
+my $sync_suffix    = 'sync';
+my $singles_suffix = 'singles';
+my $compress       = ''; # one of 'gzip' or 'dsrc'
+my $outname1;
+my $outname2;
+my $singles1_name;
+my $singles2_name;
+use constant DSRC_BIN => 'dsrc';
+use constant GZIP_BIN => 'gzip';
+use constant NAME     => 'sync_reads';
+use constant VERSION  => '0.005';
+GetOptions(
+'fwd=s'             => \$reads1,
+'rev=s'             => \$reads2,
+'singles'           => \$singles,
+'fwd_out=s'         => \$outname1,
+'rev_out=s'         => \$outname2,
+'fwd_singles_out=s' => \$singles1_name,
+'rev_singles_out=s' => \$singles2_name,
+'sync_suffix=s'     => \$sync_suffix,
+'singles_suffix=s'  => \$singles_suffix,
+'compress:s'        => \$compress,
+'help'              => sub{ pod2usage(-verbose => 2); },
+'version'           => sub{ print 'This is ', NAME, ' v', VERSION, "\n";exit; },
+) or pod2usage( -verbose => 1 );
+pod2usage(-verbose => 1, -msg => "Error: input files can\'t be read")
+if (! -r $reads1 || ! -r $reads2);
+# open output filehandles, possibly with compression
+if (! defined $outname1) {
+$outname1 = $reads1;
+$outname1 =~ s/([^\.]+)$/$sync_suffix\.$1/;
+}
+if (! defined $outname2) {
+$outname2 = $reads2;
+$outname2 =~ s/([^\.]+)$/$sync_suffix\.$1/;
+}
+my $mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>';
+my $cmd  = $compress eq 'gzip' ? GZIP_BIN . ' -c > '
+: $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s '
+: '';
+my $suffix  = $compress eq 'gzip' ? '.gz'
+: $compress eq 'dsrc' ? '.dsrc'
+: '';
+open my $s1, $mode, $cmd . $outname1 . $suffix;
+open my $s2, $mode, $cmd . $outname2 . $suffix;
+# open singles output filehandles if requested, possibly with compression
+my $up1;
+my $up2;
+if ($singles) {
+if (! defined $singles1_name) {
+$singles1_name = $reads1;
+$singles1_name =~ s/([^\.]+)$/$singles_suffix\.$1/;
+}
+if (! defined $singles2_name) {
+$singles2_name = $reads2;
+$singles2_name =~ s/([^\.]+)$/$singles_suffix\.$1/;
+}
+$mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>';
+$cmd  = $compress eq 'gzip' ? GZIP_BIN . ' -c > '
+: $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s '
+: '';
+my $suffix  = $compress eq 'gzip' ? '.gz'
+: $compress eq 'dsrc' ? '.dsrc'
+: '';
+open $up1, $mode, $cmd . $singles1_name . $suffix;
+open $up2, $mode, $cmd . $singles2_name . $suffix;
+}
+my $ll_f1 = {};
+my $ll_f2 = {};
+my $f1_prev;
+my $f2_prev;
+my $f1_open = 1;
+my $f2_open = 1;
+my $parser1 = BioX::Seq::Stream->new($reads1);
+my $parser2 = BioX::Seq::Stream->new($reads2);
+while ($f1_open || $f2_open) {
+# process next read for file 1
+if ($f1_open && defined (my $seq = $parser1->next_seq)) {
+my $name = $seq->id;
+$name =~ s/\/[12]$//; #remove directional tag if present
+if (defined $ll_f2->{$name}) {
+my $prev = $ll_f2->{$name}->{prev} // undef;
+purge_cache( $ll_f2, $prev,    $up2 );
+purge_cache( $ll_f1, $f1_prev, $up1 );
+print {$s1} $seq->as_fastq;
+print {$s2} $ll_f2->{$name}->{read};
+delete $ll_f2->{$name};
+$f1_prev = undef;
+}
+else {
+$ll_f1->{$name}->{read} = $seq->as_fastq;
+$ll_f1->{$name}->{prev} = $f1_prev // undef;
+$f1_prev = $name;
+}
+}
+else {
+$f1_open = 0;
+}
+# process next read for file 2
+if ($f2_open && defined (my $seq = $parser2->next_seq)) {
+my $name = $seq->id;
+$name =~ s/\/[12]$//; #remove directional tag if present
+if (defined $ll_f1->{$name}) {
+my $prev = $ll_f1->{$name}->{prev} // undef;
+purge_cache( $ll_f1, $prev,    $up1 );
+purge_cache( $ll_f2, $f2_prev, $up2 );
+print {$s2} $seq->as_fastq;
+print {$s1} $ll_f1->{$name}->{read};
+delete $ll_f1->{$name};
+$f2_prev = undef;
+}
+else {
+$ll_f2->{$name}->{read} = $seq->as_fastq;
+$ll_f2->{$name}->{prev} = $f2_prev // undef;
+$f2_prev = $name;
+}
+}
+else {
+$f2_open = 0;
+}
+}
+#handle remaining unpaired reads if necessary
+if ($singles) {
+purge_cache( $ll_f1, $f1_prev, $up1 );
+purge_cache( $ll_f2, $f2_prev, $up2 );
+close $up1;
+close $up2;
+}
+exit;
+sub purge_cache {
+my ($ll, $prev, $fh) = @_;
+while (defined $prev && defined $ll->{$prev}) {
+my $prev2 = $ll->{$prev}->{prev} // undef;
+print {$fh} $ll->{$prev}->{read} if ($singles);
+delete $ll->{$prev};
+$prev = $prev2;
+}
+}
+__END__
+=head1 NAME
+sync_reads - resynchronize paired FASTQ files
+=head1 SYNOPSIS
+sync_reads [options] --fwd I<left_reads> --rev I<right reads>
+=head1 DESCRIPTION
+B<sync_reads> will re-synchronize two FASTQ files containing paired reads which
+are no longer in sync due to individual removal of reads during pre-processing
+(trimming, filtering, etc). In this case, "in sync" means that both files have
+the same number of reads and, at any given read position in the files, the
+corresponding reads represent proper pairs. The resulting files will contain
+matching reads in order (assuming the input files were properly ordered). It
+will optionally print out unpaired reads to separate files. Memory usage is
+not dependent on the input file size but rather the maximum distance between
+paired reads in the two files, as the read cache is flushed each time paired
+reads are identified. In the worst-case scenario (one file has a single read
+that pairs with the last read in the matching file) memory usage can approach
+the largest file size, but in typical usage it rarely exceeds a few MB
+regardless of file size.
+B<IMPORTANT:> Reads in input files MUST be in the same order, aside from
+missing reads, or the output will report many valid pairs as singletons.
+=head1 OPTIONS
+=head2 Mandatory
+=over 4
+=item B<--fwd> I<forward_fastq>
+Specify FASTQ file containing the first of the trimmed read pairs
+=item B<--rev> I<reverse_fastq>
+Specify FASTQ file containing the second of the trimmed read pairs
+=back
+=head2 Optional
+=over 4
+=item B<--fwd_out> I<filename>
+Specify output name for synced forward reads
+=item B<--rev_out> I<filename>
+Specify output name for synced reverse reads
+=item B<--fwd_singles_out> I<filename>
+Specify output name for forward singleton reads
+=item B<--rev_singles_out> I<filename>
+Specify output name for reverse singleton reads
+=item B<--sync_suffix> I<suffix>
+Specify suffix to add to synced read output files. This will be added to the
+input read name before the final suffix (i.e. after the last period). Default
+is 'sync'.
+=item B<--compress> I<gzip|dsrc>
+Specify type of compression for output files (will compress all output files)
+=item B<--singles>
+If given, unpaired reads will be written to separate output files. Default is
+FALSE.
+=item B<--singles_suffix> I<suffix>
+Specify suffix to add to singles read output files. This will be added to the
+input read name before the final suffix (i.e. after the last period). Default
+is 'singles'.
+=item B<--help>
+Display this usage page
+=item B<--version>
+Print version information
+=back
+=head1 CAVEATS AND BUGS
+Currently no input validation is performed on the input files. Files are
+assumed to be standard FASTQ file format with each read represented by four
+lines and no other extraneous information present. B<CRITICALLY>, they are also
+assumed to be in the same input order after accounting for deleted reads
+(the software will fail miserably if this is not the case).
+Please submit bug reports to the issue tracker in the distribution repository.
+=head1 AUTHOR
+Jeremy Volkening <jdv@base2bio.com>
+=head1 COPYRIGHT AND LICENSE
+Copyright 2014-16 Jeremy Volkening
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+=cut

Mercurial > repos > jdv > b2b_summarize_assembly

comparison sync_reads @ 1:5def63878840 draft default tip