Mercurial > repos > jdv > b2b_summarize_run
comparison sync_reads @ 1:10c319d654df draft default tip
"planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9bf8a0462bd44f170c0371b6cae67dd0c3b3da9f-dirty"
| author | jdv |
|---|---|
| date | Tue, 28 Sep 2021 06:13:50 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:bd599efae04d | 1:10c319d654df |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 use strict; | |
| 4 use warnings; | |
| 5 use 5.012; | |
| 6 | |
| 7 use autodie qw/open close/; | |
| 8 use BioX::Seq::Stream; | |
| 9 use Getopt::Long; | |
| 10 use Pod::Usage; | |
| 11 | |
| 12 my $reads1; | |
| 13 my $reads2; | |
| 14 my $singles; | |
| 15 my $sync_suffix = 'sync'; | |
| 16 my $singles_suffix = 'singles'; | |
| 17 my $compress = ''; # one of 'gzip' or 'dsrc' | |
| 18 my $outname1; | |
| 19 my $outname2; | |
| 20 my $singles1_name; | |
| 21 my $singles2_name; | |
| 22 | |
| 23 use constant DSRC_BIN => 'dsrc'; | |
| 24 use constant GZIP_BIN => 'gzip'; | |
| 25 use constant NAME => 'sync_reads'; | |
| 26 use constant VERSION => '0.005'; | |
| 27 | |
| 28 GetOptions( | |
| 29 | |
| 30 'fwd=s' => \$reads1, | |
| 31 'rev=s' => \$reads2, | |
| 32 'singles' => \$singles, | |
| 33 'fwd_out=s' => \$outname1, | |
| 34 'rev_out=s' => \$outname2, | |
| 35 'fwd_singles_out=s' => \$singles1_name, | |
| 36 'rev_singles_out=s' => \$singles2_name, | |
| 37 'sync_suffix=s' => \$sync_suffix, | |
| 38 'singles_suffix=s' => \$singles_suffix, | |
| 39 'compress:s' => \$compress, | |
| 40 'help' => sub{ pod2usage(-verbose => 2); }, | |
| 41 'version' => sub{ print 'This is ', NAME, ' v', VERSION, "\n";exit; }, | |
| 42 | |
| 43 ) or pod2usage( -verbose => 1 ); | |
| 44 | |
| 45 pod2usage(-verbose => 1, -msg => "Error: input files can\'t be read") | |
| 46 if (! -r $reads1 || ! -r $reads2); | |
| 47 | |
| 48 # open output filehandles, possibly with compression | |
| 49 if (! defined $outname1) { | |
| 50 $outname1 = $reads1; | |
| 51 $outname1 =~ s/([^\.]+)$/$sync_suffix\.$1/; | |
| 52 } | |
| 53 if (! defined $outname2) { | |
| 54 $outname2 = $reads2; | |
| 55 $outname2 =~ s/([^\.]+)$/$sync_suffix\.$1/; | |
| 56 } | |
| 57 my $mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>'; | |
| 58 my $cmd = $compress eq 'gzip' ? GZIP_BIN . ' -c > ' | |
| 59 : $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s ' | |
| 60 : ''; | |
| 61 my $suffix = $compress eq 'gzip' ? '.gz' | |
| 62 : $compress eq 'dsrc' ? '.dsrc' | |
| 63 : ''; | |
| 64 open my $s1, $mode, $cmd . $outname1 . $suffix; | |
| 65 open my $s2, $mode, $cmd . $outname2 . $suffix; | |
| 66 | |
| 67 | |
| 68 # open singles output filehandles if requested, possibly with compression | |
| 69 my $up1; | |
| 70 my $up2; | |
| 71 if ($singles) { | |
| 72 if (! defined $singles1_name) { | |
| 73 $singles1_name = $reads1; | |
| 74 $singles1_name =~ s/([^\.]+)$/$singles_suffix\.$1/; | |
| 75 } | |
| 76 if (! defined $singles2_name) { | |
| 77 $singles2_name = $reads2; | |
| 78 $singles2_name =~ s/([^\.]+)$/$singles_suffix\.$1/; | |
| 79 } | |
| 80 $mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>'; | |
| 81 $cmd = $compress eq 'gzip' ? GZIP_BIN . ' -c > ' | |
| 82 : $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s ' | |
| 83 : ''; | |
| 84 my $suffix = $compress eq 'gzip' ? '.gz' | |
| 85 : $compress eq 'dsrc' ? '.dsrc' | |
| 86 : ''; | |
| 87 open $up1, $mode, $cmd . $singles1_name . $suffix; | |
| 88 open $up2, $mode, $cmd . $singles2_name . $suffix; | |
| 89 } | |
| 90 | |
| 91 my $ll_f1 = {}; | |
| 92 my $ll_f2 = {}; | |
| 93 my $f1_prev; | |
| 94 my $f2_prev; | |
| 95 my $f1_open = 1; | |
| 96 my $f2_open = 1; | |
| 97 | |
| 98 my $parser1 = BioX::Seq::Stream->new($reads1); | |
| 99 my $parser2 = BioX::Seq::Stream->new($reads2); | |
| 100 | |
| 101 while ($f1_open || $f2_open) { | |
| 102 | |
| 103 # process next read for file 1 | |
| 104 if ($f1_open && defined (my $seq = $parser1->next_seq)) { | |
| 105 my $name = $seq->id; | |
| 106 $name =~ s/\/[12]$//; #remove directional tag if present | |
| 107 if (defined $ll_f2->{$name}) { | |
| 108 my $prev = $ll_f2->{$name}->{prev} // undef; | |
| 109 purge_cache( $ll_f2, $prev, $up2 ); | |
| 110 purge_cache( $ll_f1, $f1_prev, $up1 ); | |
| 111 print {$s1} $seq->as_fastq; | |
| 112 print {$s2} $ll_f2->{$name}->{read}; | |
| 113 delete $ll_f2->{$name}; | |
| 114 $f1_prev = undef; | |
| 115 } | |
| 116 else { | |
| 117 $ll_f1->{$name}->{read} = $seq->as_fastq; | |
| 118 $ll_f1->{$name}->{prev} = $f1_prev // undef; | |
| 119 $f1_prev = $name; | |
| 120 } | |
| 121 } | |
| 122 else { | |
| 123 $f1_open = 0; | |
| 124 } | |
| 125 | |
| 126 # process next read for file 2 | |
| 127 if ($f2_open && defined (my $seq = $parser2->next_seq)) { | |
| 128 my $name = $seq->id; | |
| 129 $name =~ s/\/[12]$//; #remove directional tag if present | |
| 130 if (defined $ll_f1->{$name}) { | |
| 131 my $prev = $ll_f1->{$name}->{prev} // undef; | |
| 132 purge_cache( $ll_f1, $prev, $up1 ); | |
| 133 purge_cache( $ll_f2, $f2_prev, $up2 ); | |
| 134 print {$s2} $seq->as_fastq; | |
| 135 print {$s1} $ll_f1->{$name}->{read}; | |
| 136 delete $ll_f1->{$name}; | |
| 137 $f2_prev = undef; | |
| 138 } | |
| 139 else { | |
| 140 $ll_f2->{$name}->{read} = $seq->as_fastq; | |
| 141 $ll_f2->{$name}->{prev} = $f2_prev // undef; | |
| 142 $f2_prev = $name; | |
| 143 } | |
| 144 } | |
| 145 else { | |
| 146 $f2_open = 0; | |
| 147 } | |
| 148 } | |
| 149 | |
| 150 #handle remaining unpaired reads if necessary | |
| 151 if ($singles) { | |
| 152 purge_cache( $ll_f1, $f1_prev, $up1 ); | |
| 153 purge_cache( $ll_f2, $f2_prev, $up2 ); | |
| 154 close $up1; | |
| 155 close $up2; | |
| 156 } | |
| 157 | |
| 158 exit; | |
| 159 | |
| 160 sub purge_cache { | |
| 161 | |
| 162 my ($ll, $prev, $fh) = @_; | |
| 163 while (defined $prev && defined $ll->{$prev}) { | |
| 164 my $prev2 = $ll->{$prev}->{prev} // undef; | |
| 165 print {$fh} $ll->{$prev}->{read} if ($singles); | |
| 166 delete $ll->{$prev}; | |
| 167 $prev = $prev2; | |
| 168 } | |
| 169 | |
| 170 } | |
| 171 | |
| 172 | |
| 173 __END__ | |
| 174 | |
| 175 =head1 NAME | |
| 176 | |
| 177 sync_reads - resynchronize paired FASTQ files | |
| 178 | |
| 179 =head1 SYNOPSIS | |
| 180 | |
| 181 sync_reads [options] --fwd I<left_reads> --rev I<right reads> | |
| 182 | |
| 183 =head1 DESCRIPTION | |
| 184 | |
| 185 B<sync_reads> will re-synchronize two FASTQ files containing paired reads which | |
| 186 are no longer in sync due to individual removal of reads during pre-processing | |
| 187 (trimming, filtering, etc). In this case, "in sync" means that both files have | |
| 188 the same number of reads and, at any given read position in the files, the | |
| 189 corresponding reads represent proper pairs. The resulting files will contain | |
| 190 matching reads in order (assuming the input files were properly ordered). It | |
| 191 will optionally print out unpaired reads to separate files. Memory usage is | |
| 192 not dependent on the input file size but rather the maximum distance between | |
| 193 paired reads in the two files, as the read cache is flushed each time paired | |
| 194 reads are identified. In the worst-case scenario (one file has a single read | |
| 195 that pairs with the last read in the matching file) memory usage can approach | |
| 196 the largest file size, but in typical usage it rarely exceeds a few MB | |
| 197 regardless of file size. | |
| 198 | |
| 199 B<IMPORTANT:> Reads in input files MUST be in the same order, aside from | |
| 200 missing reads, or the output will report many valid pairs as singletons. | |
| 201 | |
| 202 =head1 OPTIONS | |
| 203 | |
| 204 =head2 Mandatory | |
| 205 | |
| 206 =over 4 | |
| 207 | |
| 208 =item B<--fwd> I<forward_fastq> | |
| 209 | |
| 210 Specify FASTQ file containing the first of the trimmed read pairs | |
| 211 | |
| 212 =item B<--rev> I<reverse_fastq> | |
| 213 | |
| 214 Specify FASTQ file containing the second of the trimmed read pairs | |
| 215 | |
| 216 =back | |
| 217 | |
| 218 =head2 Optional | |
| 219 | |
| 220 =over 4 | |
| 221 | |
| 222 =item B<--fwd_out> I<filename> | |
| 223 | |
| 224 Specify output name for synced forward reads | |
| 225 | |
| 226 =item B<--rev_out> I<filename> | |
| 227 | |
| 228 Specify output name for synced reverse reads | |
| 229 | |
| 230 =item B<--fwd_singles_out> I<filename> | |
| 231 | |
| 232 Specify output name for forward singleton reads | |
| 233 | |
| 234 =item B<--rev_singles_out> I<filename> | |
| 235 | |
| 236 Specify output name for reverse singleton reads | |
| 237 | |
| 238 =item B<--sync_suffix> I<suffix> | |
| 239 | |
| 240 Specify suffix to add to synced read output files. This will be added to the | |
| 241 input read name before the final suffix (i.e. after the last period). Default | |
| 242 is 'sync'. | |
| 243 | |
| 244 =item B<--compress> I<gzip|dsrc> | |
| 245 | |
| 246 Specify type of compression for output files (will compress all output files) | |
| 247 | |
| 248 =item B<--singles> | |
| 249 | |
| 250 If given, unpaired reads will be written to separate output files. Default is | |
| 251 FALSE. | |
| 252 | |
| 253 =item B<--singles_suffix> I<suffix> | |
| 254 | |
| 255 Specify suffix to add to singles read output files. This will be added to the | |
| 256 input read name before the final suffix (i.e. after the last period). Default | |
| 257 is 'singles'. | |
| 258 | |
| 259 =item B<--help> | |
| 260 | |
| 261 Display this usage page | |
| 262 | |
| 263 =item B<--version> | |
| 264 | |
| 265 Print version information | |
| 266 | |
| 267 =back | |
| 268 | |
| 269 =head1 CAVEATS AND BUGS | |
| 270 | |
| 271 Currently no input validation is performed on the input files. Files are | |
| 272 assumed to be standard FASTQ file format with each read represented by four | |
| 273 lines and no other extraneous information present. B<CRITICALLY>, they are also | |
| 274 assumed to be in the same input order after accounting for deleted reads | |
| 275 (the software will fail miserably if this is not the case). | |
| 276 | |
| 277 Please submit bug reports to the issue tracker in the distribution repository. | |
| 278 | |
| 279 =head1 AUTHOR | |
| 280 | |
| 281 Jeremy Volkening <jdv@base2bio.com> | |
| 282 | |
| 283 =head1 COPYRIGHT AND LICENSE | |
| 284 | |
| 285 Copyright 2014-16 Jeremy Volkening | |
| 286 | |
| 287 This program is free software: you can redistribute it and/or modify | |
| 288 it under the terms of the GNU General Public License as published by | |
| 289 the Free Software Foundation, either version 3 of the License, or | |
| 290 (at your option) any later version. | |
| 291 | |
| 292 This program is distributed in the hope that it will be useful, | |
| 293 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 294 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 295 GNU General Public License for more details. | |
| 296 | |
| 297 You should have received a copy of the GNU General Public License | |
| 298 along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 299 | |
| 300 =cut |
