comparison summarize_run.pl @ 0:6466424fc8ac draft

planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit b5d52d0664b01d252cf61b98be373d09f1ecc2df
author jdv
date Wed, 17 Jul 2019 17:48:50 -0400
parents
children b7f66945bf72
comparison
equal deleted inserted replaced
-1:000000000000 0:6466424fc8ac
1 #!/usr/bin/perl
2
3 use strict;
4 use warnings;
5
6 use Getopt::Long;
7 use BioX::Seq::Stream;
8 use List::Util qw/sum/;
9
10 my $fn_raw1;
11 my $fn_raw2;
12 my $fn_filt1;
13 my $fn_filt2;
14 my $fn_bedgraph;
15 my $fn_qc1;
16 my $fn_qc2;
17 my $fn_consensus;
18 my $fn_out;
19 my $n_threads = 1;
20 my $max_aln = 100000;
21
22 GetOptions(
23 'raw_1=s' => \$fn_raw1,
24 'raw_2=s' => \$fn_raw2,
25 'filt_1=s' => \$fn_filt1,
26 'filt_2=s' => \$fn_filt2,
27 'bedgraph=s' => \$fn_bedgraph,
28 'fastqc_1=s' => \$fn_qc1,
29 'fastqc_2=s' => \$fn_qc2,
30 'consensus=s' => \$fn_consensus,
31 'out=s' => \$fn_out,
32 'threads=i' => \$n_threads,
33 'max_aln=i' => \$max_aln,
34 );
35
36
37 my @counts;
38 for ($fn_raw1, $fn_raw2, $fn_filt1, $fn_filt2) {
39 open my $in, '-|', 'wc', '-l', $_;
40 my $ret = <$in>;
41 close $in;
42 chomp $ret;
43 my ($count, $fn) = split ' ', $ret;
44 die "line length not multiple of four for $_\n"
45 if ($count % 4);
46 push @counts, $count/4;
47 }
48
49
50 die "raw pair count mismatch\n" if ($counts[0] != $counts[1]);
51 die "filtered pair count mismatch\n" if ($counts[2] != $counts[3]);
52
53 #warn "calculating fragment length stats...\n";
54 my @lens;
55 open my $stream, '-|', "frag_lens","--forward",$fn_filt1,"--reverse",$fn_filt2,"--ref",$fn_consensus,"--threads",$n_threads,"--max_aln",$max_aln;
56 while (<$stream>) {
57 chomp $_;
58 push @lens, $_;
59 }
60 close $stream;
61
62 my $frag_mean = int( sum(@lens)/scalar(@lens)+0.5 );
63 my $frag_sd = int( sqrt( sum( map {($_ - $frag_mean)**2} @lens)/(scalar(@lens)-1) )+0.5 );
64
65 # extract FastQC data
66 #warn "extracting FastQC stats...\n";
67
68 my @five_nums;
69 for my $fn ($fn_qc1, $fn_qc2) {
70 open my $in, '<', $fn;
71
72 my $in_data = 0;
73 my @data;
74 LINE:
75 while (my $line = <$in>) {
76 chomp $line;
77 if ($in_data) {
78 if ($line =~ /^>>END_MODULE/) {
79 last LINE;
80 }
81 next if ($line =~ /^#/);
82 my ($score, $count) = split ' ', $line;
83 push @data, [$score,$count];
84 }
85 elsif ($line =~ /^>>Per sequence quality scores/) {
86 $in_data = 1;
87 }
88 }
89
90 push @five_nums, data_to_5( @data );
91 }
92
93 # Count contigs
94
95 my $p = BioX::Seq::Stream->new($fn_consensus);
96 my %n_contigs;
97 my @names;
98 while (my $seq = $p->next_seq) {
99
100 my $id = $seq->id;
101 push @names, $id;
102 while ($seq =~ /[^Nn]+/g) {
103 ++$n_contigs{$id};
104 }
105 }
106
107 # Parse assembly depth info
108 #warn "calculating coverage stats...\n";
109
110 open my $in, '<', $fn_bedgraph;
111
112 my %cov_5nums;
113 my %counts;
114 my $last_end;
115 my $last_contig;
116 my $head = <$in>;
117 while (my $line = <$in>) {
118 chomp $line;
119 my ($contig,$start,$end,$depth) = split "\t", $line;
120 $last_contig //= $contig;
121 if ($contig ne $last_contig) {
122
123 my @depths = sort {$a <=> $b} keys %counts;
124 my @data;
125 for (@depths) {
126 push @data, [$_, $counts{$_}];
127 }
128 $cov_5nums{$last_contig} = data_to_5( @data );
129 $last_contig = $contig;
130 %counts = ();
131 $last_end = undef;
132 }
133
134 if (defined($last_end) && $last_end < $start) {
135 $counts{0} += $start - $last_end;
136 }
137 $counts{$depth} += $end - $start;
138 $last_end = $end;
139 }
140 my @depths = sort {$a <=> $b} keys %counts;
141 my @data;
142 for (@depths) {
143 push @data, [$_, $counts{$_}];
144 }
145 $cov_5nums{$last_contig} = data_to_5( @data );
146
147 open my $out, '>', $fn_out;
148 print {$out} join("\t",
149 '#id',
150 'raw_read_pairs',
151 'filt_read_pairs',
152 'frag_len_mean',
153 'frag_len_sd',
154 'forward_qual',
155 'reverse_qual',
156 'n_contigs',
157 'coverage_depth',
158 ), "\n";
159 for my $id (@names) {
160 print {$out} join("\t",
161 $id,
162 $counts[0],
163 $counts[2],
164 $frag_mean,
165 $frag_sd,
166 $five_nums[0],
167 $five_nums[1],
168 $n_contigs{$id},
169 $cov_5nums{$id},
170 ), "\n";
171 }
172 close $out;
173
174 exit;
175
176
177 sub data_to_5 {
178
179 my (@data) = @_;
180 my $total = sum map {$_->[1]} @data;
181 my @five_num;
182 my $curr = 0;
183 for my $i (0..$#data) {
184 $curr += $data[$i]->[1];
185 for my $j (0..4) {
186 next if (defined $five_num[$j]);
187 my $quant = $j/4;
188 if ($curr/$total > $quant) {
189 $five_num[$j] = $data[$i]->[0];
190 }
191 elsif ($curr/$total == $quant) {
192 $five_num[$j] = $i < $#data
193 ? int(($data[$i]->[0]+$data[$i+1]->[0])/2)
194 : $data[$i]->[0];
195 }
196 }
197 }
198 return join('|',@five_num);
199
200 }