Mercurial > repos > jdv > b2b_summarize_assembly
comparison sync_reads @ 1:5def63878840 draft default tip
"planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9bf8a0462bd44f170c0371b6cae67dd0c3b3da9f-dirty"
author | jdv |
---|---|
date | Tue, 28 Sep 2021 06:16:07 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:114353d77370 | 1:5def63878840 |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 use strict; | |
4 use warnings; | |
5 use 5.012; | |
6 | |
7 use autodie qw/open close/; | |
8 use BioX::Seq::Stream; | |
9 use Getopt::Long; | |
10 use Pod::Usage; | |
11 | |
12 my $reads1; | |
13 my $reads2; | |
14 my $singles; | |
15 my $sync_suffix = 'sync'; | |
16 my $singles_suffix = 'singles'; | |
17 my $compress = ''; # one of 'gzip' or 'dsrc' | |
18 my $outname1; | |
19 my $outname2; | |
20 my $singles1_name; | |
21 my $singles2_name; | |
22 | |
23 use constant DSRC_BIN => 'dsrc'; | |
24 use constant GZIP_BIN => 'gzip'; | |
25 use constant NAME => 'sync_reads'; | |
26 use constant VERSION => '0.005'; | |
27 | |
28 GetOptions( | |
29 | |
30 'fwd=s' => \$reads1, | |
31 'rev=s' => \$reads2, | |
32 'singles' => \$singles, | |
33 'fwd_out=s' => \$outname1, | |
34 'rev_out=s' => \$outname2, | |
35 'fwd_singles_out=s' => \$singles1_name, | |
36 'rev_singles_out=s' => \$singles2_name, | |
37 'sync_suffix=s' => \$sync_suffix, | |
38 'singles_suffix=s' => \$singles_suffix, | |
39 'compress:s' => \$compress, | |
40 'help' => sub{ pod2usage(-verbose => 2); }, | |
41 'version' => sub{ print 'This is ', NAME, ' v', VERSION, "\n";exit; }, | |
42 | |
43 ) or pod2usage( -verbose => 1 ); | |
44 | |
45 pod2usage(-verbose => 1, -msg => "Error: input files can\'t be read") | |
46 if (! -r $reads1 || ! -r $reads2); | |
47 | |
48 # open output filehandles, possibly with compression | |
49 if (! defined $outname1) { | |
50 $outname1 = $reads1; | |
51 $outname1 =~ s/([^\.]+)$/$sync_suffix\.$1/; | |
52 } | |
53 if (! defined $outname2) { | |
54 $outname2 = $reads2; | |
55 $outname2 =~ s/([^\.]+)$/$sync_suffix\.$1/; | |
56 } | |
57 my $mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>'; | |
58 my $cmd = $compress eq 'gzip' ? GZIP_BIN . ' -c > ' | |
59 : $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s ' | |
60 : ''; | |
61 my $suffix = $compress eq 'gzip' ? '.gz' | |
62 : $compress eq 'dsrc' ? '.dsrc' | |
63 : ''; | |
64 open my $s1, $mode, $cmd . $outname1 . $suffix; | |
65 open my $s2, $mode, $cmd . $outname2 . $suffix; | |
66 | |
67 | |
68 # open singles output filehandles if requested, possibly with compression | |
69 my $up1; | |
70 my $up2; | |
71 if ($singles) { | |
72 if (! defined $singles1_name) { | |
73 $singles1_name = $reads1; | |
74 $singles1_name =~ s/([^\.]+)$/$singles_suffix\.$1/; | |
75 } | |
76 if (! defined $singles2_name) { | |
77 $singles2_name = $reads2; | |
78 $singles2_name =~ s/([^\.]+)$/$singles_suffix\.$1/; | |
79 } | |
80 $mode = $compress =~ /^(?:gzip|dsrc)$/ ? '|-' : '>'; | |
81 $cmd = $compress eq 'gzip' ? GZIP_BIN . ' -c > ' | |
82 : $compress eq 'dsrc' ? DSRC_BIN . ' c -m2 -s ' | |
83 : ''; | |
84 my $suffix = $compress eq 'gzip' ? '.gz' | |
85 : $compress eq 'dsrc' ? '.dsrc' | |
86 : ''; | |
87 open $up1, $mode, $cmd . $singles1_name . $suffix; | |
88 open $up2, $mode, $cmd . $singles2_name . $suffix; | |
89 } | |
90 | |
91 my $ll_f1 = {}; | |
92 my $ll_f2 = {}; | |
93 my $f1_prev; | |
94 my $f2_prev; | |
95 my $f1_open = 1; | |
96 my $f2_open = 1; | |
97 | |
98 my $parser1 = BioX::Seq::Stream->new($reads1); | |
99 my $parser2 = BioX::Seq::Stream->new($reads2); | |
100 | |
101 while ($f1_open || $f2_open) { | |
102 | |
103 # process next read for file 1 | |
104 if ($f1_open && defined (my $seq = $parser1->next_seq)) { | |
105 my $name = $seq->id; | |
106 $name =~ s/\/[12]$//; #remove directional tag if present | |
107 if (defined $ll_f2->{$name}) { | |
108 my $prev = $ll_f2->{$name}->{prev} // undef; | |
109 purge_cache( $ll_f2, $prev, $up2 ); | |
110 purge_cache( $ll_f1, $f1_prev, $up1 ); | |
111 print {$s1} $seq->as_fastq; | |
112 print {$s2} $ll_f2->{$name}->{read}; | |
113 delete $ll_f2->{$name}; | |
114 $f1_prev = undef; | |
115 } | |
116 else { | |
117 $ll_f1->{$name}->{read} = $seq->as_fastq; | |
118 $ll_f1->{$name}->{prev} = $f1_prev // undef; | |
119 $f1_prev = $name; | |
120 } | |
121 } | |
122 else { | |
123 $f1_open = 0; | |
124 } | |
125 | |
126 # process next read for file 2 | |
127 if ($f2_open && defined (my $seq = $parser2->next_seq)) { | |
128 my $name = $seq->id; | |
129 $name =~ s/\/[12]$//; #remove directional tag if present | |
130 if (defined $ll_f1->{$name}) { | |
131 my $prev = $ll_f1->{$name}->{prev} // undef; | |
132 purge_cache( $ll_f1, $prev, $up1 ); | |
133 purge_cache( $ll_f2, $f2_prev, $up2 ); | |
134 print {$s2} $seq->as_fastq; | |
135 print {$s1} $ll_f1->{$name}->{read}; | |
136 delete $ll_f1->{$name}; | |
137 $f2_prev = undef; | |
138 } | |
139 else { | |
140 $ll_f2->{$name}->{read} = $seq->as_fastq; | |
141 $ll_f2->{$name}->{prev} = $f2_prev // undef; | |
142 $f2_prev = $name; | |
143 } | |
144 } | |
145 else { | |
146 $f2_open = 0; | |
147 } | |
148 } | |
149 | |
150 #handle remaining unpaired reads if necessary | |
151 if ($singles) { | |
152 purge_cache( $ll_f1, $f1_prev, $up1 ); | |
153 purge_cache( $ll_f2, $f2_prev, $up2 ); | |
154 close $up1; | |
155 close $up2; | |
156 } | |
157 | |
158 exit; | |
159 | |
160 sub purge_cache { | |
161 | |
162 my ($ll, $prev, $fh) = @_; | |
163 while (defined $prev && defined $ll->{$prev}) { | |
164 my $prev2 = $ll->{$prev}->{prev} // undef; | |
165 print {$fh} $ll->{$prev}->{read} if ($singles); | |
166 delete $ll->{$prev}; | |
167 $prev = $prev2; | |
168 } | |
169 | |
170 } | |
171 | |
172 | |
173 __END__ | |
174 | |
175 =head1 NAME | |
176 | |
177 sync_reads - resynchronize paired FASTQ files | |
178 | |
179 =head1 SYNOPSIS | |
180 | |
181 sync_reads [options] --fwd I<left_reads> --rev I<right reads> | |
182 | |
183 =head1 DESCRIPTION | |
184 | |
185 B<sync_reads> will re-synchronize two FASTQ files containing paired reads which | |
186 are no longer in sync due to individual removal of reads during pre-processing | |
187 (trimming, filtering, etc). In this case, "in sync" means that both files have | |
188 the same number of reads and, at any given read position in the files, the | |
189 corresponding reads represent proper pairs. The resulting files will contain | |
190 matching reads in order (assuming the input files were properly ordered). It | |
191 will optionally print out unpaired reads to separate files. Memory usage is | |
192 not dependent on the input file size but rather the maximum distance between | |
193 paired reads in the two files, as the read cache is flushed each time paired | |
194 reads are identified. In the worst-case scenario (one file has a single read | |
195 that pairs with the last read in the matching file) memory usage can approach | |
196 the largest file size, but in typical usage it rarely exceeds a few MB | |
197 regardless of file size. | |
198 | |
199 B<IMPORTANT:> Reads in input files MUST be in the same order, aside from | |
200 missing reads, or the output will report many valid pairs as singletons. | |
201 | |
202 =head1 OPTIONS | |
203 | |
204 =head2 Mandatory | |
205 | |
206 =over 4 | |
207 | |
208 =item B<--fwd> I<forward_fastq> | |
209 | |
210 Specify FASTQ file containing the first of the trimmed read pairs | |
211 | |
212 =item B<--rev> I<reverse_fastq> | |
213 | |
214 Specify FASTQ file containing the second of the trimmed read pairs | |
215 | |
216 =back | |
217 | |
218 =head2 Optional | |
219 | |
220 =over 4 | |
221 | |
222 =item B<--fwd_out> I<filename> | |
223 | |
224 Specify output name for synced forward reads | |
225 | |
226 =item B<--rev_out> I<filename> | |
227 | |
228 Specify output name for synced reverse reads | |
229 | |
230 =item B<--fwd_singles_out> I<filename> | |
231 | |
232 Specify output name for forward singleton reads | |
233 | |
234 =item B<--rev_singles_out> I<filename> | |
235 | |
236 Specify output name for reverse singleton reads | |
237 | |
238 =item B<--sync_suffix> I<suffix> | |
239 | |
240 Specify suffix to add to synced read output files. This will be added to the | |
241 input read name before the final suffix (i.e. after the last period). Default | |
242 is 'sync'. | |
243 | |
244 =item B<--compress> I<gzip|dsrc> | |
245 | |
246 Specify type of compression for output files (will compress all output files) | |
247 | |
248 =item B<--singles> | |
249 | |
250 If given, unpaired reads will be written to separate output files. Default is | |
251 FALSE. | |
252 | |
253 =item B<--singles_suffix> I<suffix> | |
254 | |
255 Specify suffix to add to singles read output files. This will be added to the | |
256 input read name before the final suffix (i.e. after the last period). Default | |
257 is 'singles'. | |
258 | |
259 =item B<--help> | |
260 | |
261 Display this usage page | |
262 | |
263 =item B<--version> | |
264 | |
265 Print version information | |
266 | |
267 =back | |
268 | |
269 =head1 CAVEATS AND BUGS | |
270 | |
271 Currently no input validation is performed on the input files. Files are | |
272 assumed to be standard FASTQ file format with each read represented by four | |
273 lines and no other extraneous information present. B<CRITICALLY>, they are also | |
274 assumed to be in the same input order after accounting for deleted reads | |
275 (the software will fail miserably if this is not the case). | |
276 | |
277 Please submit bug reports to the issue tracker in the distribution repository. | |
278 | |
279 =head1 AUTHOR | |
280 | |
281 Jeremy Volkening <jdv@base2bio.com> | |
282 | |
283 =head1 COPYRIGHT AND LICENSE | |
284 | |
285 Copyright 2014-16 Jeremy Volkening | |
286 | |
287 This program is free software: you can redistribute it and/or modify | |
288 it under the terms of the GNU General Public License as published by | |
289 the Free Software Foundation, either version 3 of the License, or | |
290 (at your option) any later version. | |
291 | |
292 This program is distributed in the hope that it will be useful, | |
293 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
294 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
295 GNU General Public License for more details. | |
296 | |
297 You should have received a copy of the GNU General Public License | |
298 along with this program. If not, see <http://www.gnu.org/licenses/>. | |
299 | |
300 =cut |