Mercurial > repos > jdv > b2b_bam2consensus
comparison summarize_assembly @ 1:2367d00c5182 draft default tip
"planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9bf8a0462bd44f170c0371b6cae67dd0c3b3da9f-dirty"
author | jdv |
---|---|
date | Tue, 28 Sep 2021 06:12:40 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:d32139014ec3 | 1:2367d00c5182 |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 use strict; | |
4 use warnings; | |
5 use 5.012; | |
6 | |
7 use Getopt::Long; | |
8 use Pod::Usage; | |
9 use List::Util qw/sum max min/; | |
10 use List::MoreUtils qw/uniq/; | |
11 use BioX::Seq::Stream; | |
12 | |
13 my $PROGRAM = 'summarize_assembly'; | |
14 my $VERSION = 0.002; | |
15 | |
16 my $fn_fasta; | |
17 my @cutoffs; | |
18 my $strip = 0; | |
19 my $split = 0; | |
20 | |
21 # Collect command-line parameters | |
22 my $err_msg = 'Syntax error: please check your syntax'; | |
23 GetOptions( | |
24 'fasta=s' => \$fn_fasta, | |
25 'cutoffs:i{,}' => \@cutoffs, | |
26 'strip_N' => \$strip, | |
27 'split_N' => \$split, | |
28 'help' => sub{ pod2usage( -verbose => 2 ) }, | |
29 'version' => sub{ print "This is $PROGRAM v$VERSION\n";exit; }, | |
30 ) or pod2usage( -msg => $err_msg, -verbose => 1 ); | |
31 | |
32 # Set default cutoffs if necessary and sort | |
33 if (! scalar @cutoffs) { | |
34 warn "No cutoff supplied, defaulting to N50\n"; | |
35 push @cutoffs, 50; | |
36 } | |
37 @cutoffs = sort {$a <=> $b} uniq @cutoffs; | |
38 | |
39 # Only one of 'strip_N' or 'split_N' is valid | |
40 if ($strip && $split) { | |
41 warn "Only one of --strip_N or --split_N is valid, ignoring --strip_N\n"; | |
42 $strip = 0; | |
43 } | |
44 | |
45 # Check for a few necessary conditions | |
46 die "Can't open FASTA file for reading" | |
47 if (defined $fn_fasta && ! -r $fn_fasta); | |
48 die "One or more cutoffs outside valid range (1-99)" | |
49 if (grep {$_ < 1 || $_ > 99} @cutoffs); | |
50 die "Cutoffs must be integer values" | |
51 if (grep {$_ ne int($_)} @cutoffs); | |
52 my @lens; | |
53 my $N_sum = 0; | |
54 my $GC_sum = 0; | |
55 | |
56 # Read in sequences and calculate descriptive stats | |
57 my $stream = BioX::Seq::Stream->new( $fn_fasta ); #STDIN if undefined | |
58 | |
59 SEQ: | |
60 while (my $seq = $stream->next_seq) { | |
61 my @parts = ($seq); | |
62 @parts = split(/n+/i, $seq) if $split; | |
63 for (@parts) { | |
64 my $Ns = ($_ =~ tr/Nn//); | |
65 $N_sum += $Ns; | |
66 $GC_sum += ($_ =~ tr/GCgc//); | |
67 push @lens, length($_) - $Ns * $strip; | |
68 } | |
69 } | |
70 @lens = sort {$b <=> $a} @lens; | |
71 | |
72 # Calculate basic stats | |
73 my $scaffold_count = scalar @lens; | |
74 my $total_len = sum @lens; | |
75 my $N_fraction = round( $N_sum/($total_len + $N_sum*$strip), 2 )*100; | |
76 my $max_length = max @lens; | |
77 my $min_length = min @lens; | |
78 my $mean_length = round($total_len/$scaffold_count, 0); | |
79 | |
80 # GC percentage calculated from non-ambiguous bases only | |
81 my $GC_fraction = round( $GC_sum/($total_len + ($strip - 1)*$N_sum), 2 )*100; | |
82 | |
83 # Calculate Nx (N50, N80, etc) values | |
84 # For example, N50 is the size of the smallest contig for which it and all | |
85 # larger contigs contain 50% of the total nucleotides in the database | |
86 my $cum_length = 0; | |
87 my @fractions = map {$_/100} @cutoffs; | |
88 my @Nx; | |
89 | |
90 LEN: | |
91 for (@lens) { | |
92 $cum_length += $_; | |
93 if ($cum_length/$total_len >= $fractions[0]) { | |
94 push @Nx, $_; | |
95 shift @fractions; | |
96 last LEN if (@fractions < 1); | |
97 } | |
98 } | |
99 | |
100 # Print summary | |
101 print '-' x 40 . "\n" | |
102 . "Summary\n" | |
103 . '-' x 40 . "\n" | |
104 . "number of scaffolds: $scaffold_count\n" | |
105 . "total length: $total_len\n" | |
106 . "average length: $mean_length\n" | |
107 . "G/C content: $GC_fraction\%\n" | |
108 . "ambiguous content: $N_fraction\%\n" | |
109 . "longest scaffold: $max_length\n"; | |
110 for (0..$#Nx) { | |
111 my $label = sprintf "N%02d", $cutoffs[$_]; | |
112 print "$label: $Nx[$_]\n"; | |
113 } | |
114 print "shortest scaffold: $min_length\n"; | |
115 print "NOTE: Ns were stripped for above calculations\n" if ($strip); | |
116 print "NOTE: Scaffolds were split on Ns for above calculations\n" if ($split); | |
117 print '-' x 40 . "\n"; | |
118 | |
119 exit; | |
120 | |
121 sub round { | |
122 | |
123 my ($val,$places) = @_; | |
124 return int($val*10**$places+0.5)/10**$places; | |
125 | |
126 } | |
127 | |
128 | |
129 __END__ | |
130 | |
131 =head1 NAME | |
132 | |
133 summarize_assembly - print basic summary info for a file of assembly scaffolds | |
134 | |
135 =head1 SYNOPSIS | |
136 | |
137 summarize_assembly [--cutoffs I<cutoff_1> I<cutoff_2> .. I<cutoff_N> --strip_N --split_N ] --fasta I<input_file>] | |
138 | |
139 =head1 DESCRIPTION | |
140 | |
141 This program takes a FASTA file and optionally a list of cutoff values as | |
142 input and prints out summary information about the contigs/scaffolds contained | |
143 in the file. You can, of course, supply a FASTA file of any sort of nucleic | |
144 acid sequences, but the summary information makes most sense for contigs from | |
145 genomic sequencing assemblies. | |
146 | |
147 =head1 OPTIONS | |
148 | |
149 =over | |
150 | |
151 =item B<--fasta> I<filename> | |
152 | |
153 Specify contig/scaffold file from which to read input (default: STDIN) | |
154 | |
155 =item B<--cutoffs> | |
156 | |
157 Space-separated integer list of cutoffs to calculate (e.g. '--cutoffs 50 90' | |
158 will output N50 and N90 values) (default: 50) | |
159 | |
160 =item B<--strip_N> | |
161 | |
162 If specified, Ns will be stripped from scaffold sequences before statistics | |
163 are calculated (default: FALSE) | |
164 | |
165 =item B<--split_N> | |
166 | |
167 If specified, scaffold sequences will be split at regions of one or more Ns | |
168 before statistics are calculated (e.g. to get contig-level stats from a | |
169 scaffold file). Note that if this flag is specified, the value of '--strip_N' | |
170 will be ignored. (default: FALSE) | |
171 | |
172 =item B<--help> | |
173 | |
174 Display this usage page | |
175 | |
176 =item B<--version> | |
177 | |
178 Print version information | |
179 | |
180 =back | |
181 | |
182 =head1 CAVEATS AND BUGS | |
183 | |
184 Please submit bug reports to the issue tracker in the distribution repository. | |
185 | |
186 =head1 AUTHOR | |
187 | |
188 Jeremy Volkening (jdv@base2bio.com) | |
189 | |
190 =head1 LICENSE AND COPYRIGHT | |
191 | |
192 Copyright 2014-16 Jeremy Volkening | |
193 | |
194 This program is free software: you can redistribute it and/or modify | |
195 it under the terms of the GNU General Public License as published by | |
196 the Free Software Foundation, either version 3 of the License, or | |
197 (at your option) any later version. | |
198 | |
199 This program is distributed in the hope that it will be useful, | |
200 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
201 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
202 GNU General Public License for more details. | |
203 | |
204 You should have received a copy of the GNU General Public License | |
205 along with this program. If not, see <http://www.gnu.org/licenses/>. | |
206 | |
207 =cut | |
208 |