Mercurial > repos > jdv > albacore
diff denoise.pl @ 1:0a4f83207e53 draft
planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/albacore commit 4aa7a76a7b29c425dd89a020979e835d785d3c95-dirty
author | jdv |
---|---|
date | Wed, 06 Sep 2017 12:12:52 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/denoise.pl Wed Sep 06 12:12:52 2017 -0400 @@ -0,0 +1,136 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use 5.012; + +use File::Basename qw/basename/; +use File::Copy qw/copy/; +use Getopt::Long; +use List::Util qw/sum/; + +my $fn_table; +my @reads; +my @names; +my $min_score = 80; +my $min_frac = 0.05; +my $rm_unclass = 0; +my $n_keep; +my $fn_summary; + +use constant BARCODE => 14; +use constant SCORE => 15; + +my $unclass_tag = 'unclassified'; + +GetOptions( + 'input=s' => \@reads, + 'name=s' => \@names, + 'table=s' => \$fn_table, + 'min_score=f' => \$min_score, + 'min_frac=f' => \$min_frac, + 'remove_unclassified' => \$rm_unclass, + 'n_keep=i' => \$n_keep, + 'summary=s' => \$fn_summary, +); + +die "Table not found\n" + if (! -r $fn_table); + +open my $tsv, '<', $fn_table; +my $head = <$tsv>; +chomp $head; +my @fields = split "\t", $head; + +die "unexpected field order" + if ($fields[BARCODE] ne 'barcode_arrangement'); +die "unexpected field order" + if ($fields[SCORE] ne 'barcode_score'); + +my %counts; +my %sums; + +while (my $line = <$tsv>) { + chomp $line; + my @fields = split "\t", $line; + my $bin = $fields[BARCODE]; + my $score = $fields[SCORE]; + $counts{$bin} += 1; + $sums{$bin} += $score; +} + +if ($rm_unclass) { + delete $counts{$unclass_tag}; + delete $sums{$unclass_tag}; +} + +my @keys = sort {$counts{$b} <=> $counts{$a}} keys %counts; + +my %scores; + +my %status; + +@status{ @keys } = ('discarded') x scalar(@keys); + +for (@keys) { + $scores{$_} = $sums{$_}/$counts{$_}; +} +my $sum_count = sum(values %counts); +my $mean_count = $sum_count / scalar(values %counts); + +if (defined $n_keep) { + + my %rank_scores; + @rank_scores{ @keys } = map {$scores{$_} * $counts{$_}/$sum_count} @keys; + + @keys = sort {$rank_scores{$b} <=> $rank_scores{$a}} keys %rank_scores; + + @keys = @keys[0..$n_keep-1]; + +} + +else { + + @keys = grep {$scores{$_} >= $min_score} @keys; + @keys = grep {$counts{$_} >= $mean_count*$min_frac} @keys; + +} + +@status{ @keys } = ('kept') x scalar(@keys); + +@keys = sort {$counts{$b} <=> $counts{$a}} keys %status; + +# print summary table +open my $summary, '>', $fn_summary + or die "Failed to open summary: $!\n"; +say {$summary} join "\t", + '#bin', + 'n_reads', + 'avg_score', + 'status', +; +for (@keys) { + say {$summary} join "\t", + $_, + $counts{$_}, + sprintf("%0.3f",$scores{$_}), + $status{$_}, + ; +} +close $summary; + +mkdir('outputs'); + +for my $i (0..$#reads) { + + my $fn = $reads[$i]; + my $name = $names[$i]; + + my $base = basename($name); + my $bin = $base; + $bin =~ s/\.(fastq|fq|fast5\.tar\.gz)$//i; + if (defined $status{$bin} && $status{$bin} eq 'kept') { + copy( $fn, "outputs/$base" ); + } + +}