comparison denoise.pl @ 1:0a4f83207e53 draft

planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/albacore commit 4aa7a76a7b29c425dd89a020979e835d785d3c95-dirty
author jdv
date Wed, 06 Sep 2017 12:12:52 -0400
parents
children
comparison
equal deleted inserted replaced
0:f8e25d69167d 1:0a4f83207e53
1 #!/usr/bin/env perl
2
3 use strict;
4 use warnings;
5 use 5.012;
6
7 use File::Basename qw/basename/;
8 use File::Copy qw/copy/;
9 use Getopt::Long;
10 use List::Util qw/sum/;
11
12 my $fn_table;
13 my @reads;
14 my @names;
15 my $min_score = 80;
16 my $min_frac = 0.05;
17 my $rm_unclass = 0;
18 my $n_keep;
19 my $fn_summary;
20
21 use constant BARCODE => 14;
22 use constant SCORE => 15;
23
24 my $unclass_tag = 'unclassified';
25
26 GetOptions(
27 'input=s' => \@reads,
28 'name=s' => \@names,
29 'table=s' => \$fn_table,
30 'min_score=f' => \$min_score,
31 'min_frac=f' => \$min_frac,
32 'remove_unclassified' => \$rm_unclass,
33 'n_keep=i' => \$n_keep,
34 'summary=s' => \$fn_summary,
35 );
36
37 die "Table not found\n"
38 if (! -r $fn_table);
39
40 open my $tsv, '<', $fn_table;
41 my $head = <$tsv>;
42 chomp $head;
43 my @fields = split "\t", $head;
44
45 die "unexpected field order"
46 if ($fields[BARCODE] ne 'barcode_arrangement');
47 die "unexpected field order"
48 if ($fields[SCORE] ne 'barcode_score');
49
50 my %counts;
51 my %sums;
52
53 while (my $line = <$tsv>) {
54 chomp $line;
55 my @fields = split "\t", $line;
56 my $bin = $fields[BARCODE];
57 my $score = $fields[SCORE];
58 $counts{$bin} += 1;
59 $sums{$bin} += $score;
60 }
61
62 if ($rm_unclass) {
63 delete $counts{$unclass_tag};
64 delete $sums{$unclass_tag};
65 }
66
67 my @keys = sort {$counts{$b} <=> $counts{$a}} keys %counts;
68
69 my %scores;
70
71 my %status;
72
73 @status{ @keys } = ('discarded') x scalar(@keys);
74
75 for (@keys) {
76 $scores{$_} = $sums{$_}/$counts{$_};
77 }
78 my $sum_count = sum(values %counts);
79 my $mean_count = $sum_count / scalar(values %counts);
80
81 if (defined $n_keep) {
82
83 my %rank_scores;
84 @rank_scores{ @keys } = map {$scores{$_} * $counts{$_}/$sum_count} @keys;
85
86 @keys = sort {$rank_scores{$b} <=> $rank_scores{$a}} keys %rank_scores;
87
88 @keys = @keys[0..$n_keep-1];
89
90 }
91
92 else {
93
94 @keys = grep {$scores{$_} >= $min_score} @keys;
95 @keys = grep {$counts{$_} >= $mean_count*$min_frac} @keys;
96
97 }
98
99 @status{ @keys } = ('kept') x scalar(@keys);
100
101 @keys = sort {$counts{$b} <=> $counts{$a}} keys %status;
102
103 # print summary table
104 open my $summary, '>', $fn_summary
105 or die "Failed to open summary: $!\n";
106 say {$summary} join "\t",
107 '#bin',
108 'n_reads',
109 'avg_score',
110 'status',
111 ;
112 for (@keys) {
113 say {$summary} join "\t",
114 $_,
115 $counts{$_},
116 sprintf("%0.3f",$scores{$_}),
117 $status{$_},
118 ;
119 }
120 close $summary;
121
122 mkdir('outputs');
123
124 for my $i (0..$#reads) {
125
126 my $fn = $reads[$i];
127 my $name = $names[$i];
128
129 my $base = basename($name);
130 my $bin = $base;
131 $bin =~ s/\.(fastq|fq|fast5\.tar\.gz)$//i;
132 if (defined $status{$bin} && $status{$bin} eq 'kept') {
133 copy( $fn, "outputs/$base" );
134 }
135
136 }