Mercurial > repos > jdv > b2b_sync_reads
comparison summarize_run.pl @ 0:6466424fc8ac draft
planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit b5d52d0664b01d252cf61b98be373d09f1ecc2df
author | jdv |
---|---|
date | Wed, 17 Jul 2019 17:48:50 -0400 |
parents | |
children | b7f66945bf72 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6466424fc8ac |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 use strict; | |
4 use warnings; | |
5 | |
6 use Getopt::Long; | |
7 use BioX::Seq::Stream; | |
8 use List::Util qw/sum/; | |
9 | |
10 my $fn_raw1; | |
11 my $fn_raw2; | |
12 my $fn_filt1; | |
13 my $fn_filt2; | |
14 my $fn_bedgraph; | |
15 my $fn_qc1; | |
16 my $fn_qc2; | |
17 my $fn_consensus; | |
18 my $fn_out; | |
19 my $n_threads = 1; | |
20 my $max_aln = 100000; | |
21 | |
22 GetOptions( | |
23 'raw_1=s' => \$fn_raw1, | |
24 'raw_2=s' => \$fn_raw2, | |
25 'filt_1=s' => \$fn_filt1, | |
26 'filt_2=s' => \$fn_filt2, | |
27 'bedgraph=s' => \$fn_bedgraph, | |
28 'fastqc_1=s' => \$fn_qc1, | |
29 'fastqc_2=s' => \$fn_qc2, | |
30 'consensus=s' => \$fn_consensus, | |
31 'out=s' => \$fn_out, | |
32 'threads=i' => \$n_threads, | |
33 'max_aln=i' => \$max_aln, | |
34 ); | |
35 | |
36 | |
37 my @counts; | |
38 for ($fn_raw1, $fn_raw2, $fn_filt1, $fn_filt2) { | |
39 open my $in, '-|', 'wc', '-l', $_; | |
40 my $ret = <$in>; | |
41 close $in; | |
42 chomp $ret; | |
43 my ($count, $fn) = split ' ', $ret; | |
44 die "line length not multiple of four for $_\n" | |
45 if ($count % 4); | |
46 push @counts, $count/4; | |
47 } | |
48 | |
49 | |
50 die "raw pair count mismatch\n" if ($counts[0] != $counts[1]); | |
51 die "filtered pair count mismatch\n" if ($counts[2] != $counts[3]); | |
52 | |
53 #warn "calculating fragment length stats...\n"; | |
54 my @lens; | |
55 open my $stream, '-|', "frag_lens","--forward",$fn_filt1,"--reverse",$fn_filt2,"--ref",$fn_consensus,"--threads",$n_threads,"--max_aln",$max_aln; | |
56 while (<$stream>) { | |
57 chomp $_; | |
58 push @lens, $_; | |
59 } | |
60 close $stream; | |
61 | |
62 my $frag_mean = int( sum(@lens)/scalar(@lens)+0.5 ); | |
63 my $frag_sd = int( sqrt( sum( map {($_ - $frag_mean)**2} @lens)/(scalar(@lens)-1) )+0.5 ); | |
64 | |
65 # extract FastQC data | |
66 #warn "extracting FastQC stats...\n"; | |
67 | |
68 my @five_nums; | |
69 for my $fn ($fn_qc1, $fn_qc2) { | |
70 open my $in, '<', $fn; | |
71 | |
72 my $in_data = 0; | |
73 my @data; | |
74 LINE: | |
75 while (my $line = <$in>) { | |
76 chomp $line; | |
77 if ($in_data) { | |
78 if ($line =~ /^>>END_MODULE/) { | |
79 last LINE; | |
80 } | |
81 next if ($line =~ /^#/); | |
82 my ($score, $count) = split ' ', $line; | |
83 push @data, [$score,$count]; | |
84 } | |
85 elsif ($line =~ /^>>Per sequence quality scores/) { | |
86 $in_data = 1; | |
87 } | |
88 } | |
89 | |
90 push @five_nums, data_to_5( @data ); | |
91 } | |
92 | |
93 # Count contigs | |
94 | |
95 my $p = BioX::Seq::Stream->new($fn_consensus); | |
96 my %n_contigs; | |
97 my @names; | |
98 while (my $seq = $p->next_seq) { | |
99 | |
100 my $id = $seq->id; | |
101 push @names, $id; | |
102 while ($seq =~ /[^Nn]+/g) { | |
103 ++$n_contigs{$id}; | |
104 } | |
105 } | |
106 | |
107 # Parse assembly depth info | |
108 #warn "calculating coverage stats...\n"; | |
109 | |
110 open my $in, '<', $fn_bedgraph; | |
111 | |
112 my %cov_5nums; | |
113 my %counts; | |
114 my $last_end; | |
115 my $last_contig; | |
116 my $head = <$in>; | |
117 while (my $line = <$in>) { | |
118 chomp $line; | |
119 my ($contig,$start,$end,$depth) = split "\t", $line; | |
120 $last_contig //= $contig; | |
121 if ($contig ne $last_contig) { | |
122 | |
123 my @depths = sort {$a <=> $b} keys %counts; | |
124 my @data; | |
125 for (@depths) { | |
126 push @data, [$_, $counts{$_}]; | |
127 } | |
128 $cov_5nums{$last_contig} = data_to_5( @data ); | |
129 $last_contig = $contig; | |
130 %counts = (); | |
131 $last_end = undef; | |
132 } | |
133 | |
134 if (defined($last_end) && $last_end < $start) { | |
135 $counts{0} += $start - $last_end; | |
136 } | |
137 $counts{$depth} += $end - $start; | |
138 $last_end = $end; | |
139 } | |
140 my @depths = sort {$a <=> $b} keys %counts; | |
141 my @data; | |
142 for (@depths) { | |
143 push @data, [$_, $counts{$_}]; | |
144 } | |
145 $cov_5nums{$last_contig} = data_to_5( @data ); | |
146 | |
147 open my $out, '>', $fn_out; | |
148 print {$out} join("\t", | |
149 '#id', | |
150 'raw_read_pairs', | |
151 'filt_read_pairs', | |
152 'frag_len_mean', | |
153 'frag_len_sd', | |
154 'forward_qual', | |
155 'reverse_qual', | |
156 'n_contigs', | |
157 'coverage_depth', | |
158 ), "\n"; | |
159 for my $id (@names) { | |
160 print {$out} join("\t", | |
161 $id, | |
162 $counts[0], | |
163 $counts[2], | |
164 $frag_mean, | |
165 $frag_sd, | |
166 $five_nums[0], | |
167 $five_nums[1], | |
168 $n_contigs{$id}, | |
169 $cov_5nums{$id}, | |
170 ), "\n"; | |
171 } | |
172 close $out; | |
173 | |
174 exit; | |
175 | |
176 | |
177 sub data_to_5 { | |
178 | |
179 my (@data) = @_; | |
180 my $total = sum map {$_->[1]} @data; | |
181 my @five_num; | |
182 my $curr = 0; | |
183 for my $i (0..$#data) { | |
184 $curr += $data[$i]->[1]; | |
185 for my $j (0..4) { | |
186 next if (defined $five_num[$j]); | |
187 my $quant = $j/4; | |
188 if ($curr/$total > $quant) { | |
189 $five_num[$j] = $data[$i]->[0]; | |
190 } | |
191 elsif ($curr/$total == $quant) { | |
192 $five_num[$j] = $i < $#data | |
193 ? int(($data[$i]->[0]+$data[$i+1]->[0])/2) | |
194 : $data[$i]->[0]; | |
195 } | |
196 } | |
197 } | |
198 return join('|',@five_num); | |
199 | |
200 } |