comparison summarize_assembly @ 1:b7f66945bf72 draft default tip

"planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9bf8a0462bd44f170c0371b6cae67dd0c3b3da9f-dirty"
author jdv
date Tue, 28 Sep 2021 06:14:59 +0000
parents
children
comparison
equal deleted inserted replaced
0:6466424fc8ac 1:b7f66945bf72
1 #!/usr/bin/env perl
2
3 use strict;
4 use warnings;
5 use 5.012;
6
7 use Getopt::Long;
8 use Pod::Usage;
9 use List::Util qw/sum max min/;
10 use List::MoreUtils qw/uniq/;
11 use BioX::Seq::Stream;
12
13 my $PROGRAM = 'summarize_assembly';
14 my $VERSION = 0.002;
15
16 my $fn_fasta;
17 my @cutoffs;
18 my $strip = 0;
19 my $split = 0;
20
21 # Collect command-line parameters
22 my $err_msg = 'Syntax error: please check your syntax';
23 GetOptions(
24 'fasta=s' => \$fn_fasta,
25 'cutoffs:i{,}' => \@cutoffs,
26 'strip_N' => \$strip,
27 'split_N' => \$split,
28 'help' => sub{ pod2usage( -verbose => 2 ) },
29 'version' => sub{ print "This is $PROGRAM v$VERSION\n";exit; },
30 ) or pod2usage( -msg => $err_msg, -verbose => 1 );
31
32 # Set default cutoffs if necessary and sort
33 if (! scalar @cutoffs) {
34 warn "No cutoff supplied, defaulting to N50\n";
35 push @cutoffs, 50;
36 }
37 @cutoffs = sort {$a <=> $b} uniq @cutoffs;
38
39 # Only one of 'strip_N' or 'split_N' is valid
40 if ($strip && $split) {
41 warn "Only one of --strip_N or --split_N is valid, ignoring --strip_N\n";
42 $strip = 0;
43 }
44
45 # Check for a few necessary conditions
46 die "Can't open FASTA file for reading"
47 if (defined $fn_fasta && ! -r $fn_fasta);
48 die "One or more cutoffs outside valid range (1-99)"
49 if (grep {$_ < 1 || $_ > 99} @cutoffs);
50 die "Cutoffs must be integer values"
51 if (grep {$_ ne int($_)} @cutoffs);
52 my @lens;
53 my $N_sum = 0;
54 my $GC_sum = 0;
55
56 # Read in sequences and calculate descriptive stats
57 my $stream = BioX::Seq::Stream->new( $fn_fasta ); #STDIN if undefined
58
59 SEQ:
60 while (my $seq = $stream->next_seq) {
61 my @parts = ($seq);
62 @parts = split(/n+/i, $seq) if $split;
63 for (@parts) {
64 my $Ns = ($_ =~ tr/Nn//);
65 $N_sum += $Ns;
66 $GC_sum += ($_ =~ tr/GCgc//);
67 push @lens, length($_) - $Ns * $strip;
68 }
69 }
70 @lens = sort {$b <=> $a} @lens;
71
72 # Calculate basic stats
73 my $scaffold_count = scalar @lens;
74 my $total_len = sum @lens;
75 my $N_fraction = round( $N_sum/($total_len + $N_sum*$strip), 2 )*100;
76 my $max_length = max @lens;
77 my $min_length = min @lens;
78 my $mean_length = round($total_len/$scaffold_count, 0);
79
80 # GC percentage calculated from non-ambiguous bases only
81 my $GC_fraction = round( $GC_sum/($total_len + ($strip - 1)*$N_sum), 2 )*100;
82
83 # Calculate Nx (N50, N80, etc) values
84 # For example, N50 is the size of the smallest contig for which it and all
85 # larger contigs contain 50% of the total nucleotides in the database
86 my $cum_length = 0;
87 my @fractions = map {$_/100} @cutoffs;
88 my @Nx;
89
90 LEN:
91 for (@lens) {
92 $cum_length += $_;
93 if ($cum_length/$total_len >= $fractions[0]) {
94 push @Nx, $_;
95 shift @fractions;
96 last LEN if (@fractions < 1);
97 }
98 }
99
100 # Print summary
101 print '-' x 40 . "\n"
102 . "Summary\n"
103 . '-' x 40 . "\n"
104 . "number of scaffolds: $scaffold_count\n"
105 . "total length: $total_len\n"
106 . "average length: $mean_length\n"
107 . "G/C content: $GC_fraction\%\n"
108 . "ambiguous content: $N_fraction\%\n"
109 . "longest scaffold: $max_length\n";
110 for (0..$#Nx) {
111 my $label = sprintf "N%02d", $cutoffs[$_];
112 print "$label: $Nx[$_]\n";
113 }
114 print "shortest scaffold: $min_length\n";
115 print "NOTE: Ns were stripped for above calculations\n" if ($strip);
116 print "NOTE: Scaffolds were split on Ns for above calculations\n" if ($split);
117 print '-' x 40 . "\n";
118
119 exit;
120
121 sub round {
122
123 my ($val,$places) = @_;
124 return int($val*10**$places+0.5)/10**$places;
125
126 }
127
128
129 __END__
130
131 =head1 NAME
132
133 summarize_assembly - print basic summary info for a file of assembly scaffolds
134
135 =head1 SYNOPSIS
136
137 summarize_assembly [--cutoffs I<cutoff_1> I<cutoff_2> .. I<cutoff_N> --strip_N --split_N ] --fasta I<input_file>]
138
139 =head1 DESCRIPTION
140
141 This program takes a FASTA file and optionally a list of cutoff values as
142 input and prints out summary information about the contigs/scaffolds contained
143 in the file. You can, of course, supply a FASTA file of any sort of nucleic
144 acid sequences, but the summary information makes most sense for contigs from
145 genomic sequencing assemblies.
146
147 =head1 OPTIONS
148
149 =over
150
151 =item B<--fasta> I<filename>
152
153 Specify contig/scaffold file from which to read input (default: STDIN)
154
155 =item B<--cutoffs>
156
157 Space-separated integer list of cutoffs to calculate (e.g. '--cutoffs 50 90'
158 will output N50 and N90 values) (default: 50)
159
160 =item B<--strip_N>
161
162 If specified, Ns will be stripped from scaffold sequences before statistics
163 are calculated (default: FALSE)
164
165 =item B<--split_N>
166
167 If specified, scaffold sequences will be split at regions of one or more Ns
168 before statistics are calculated (e.g. to get contig-level stats from a
169 scaffold file). Note that if this flag is specified, the value of '--strip_N'
170 will be ignored. (default: FALSE)
171
172 =item B<--help>
173
174 Display this usage page
175
176 =item B<--version>
177
178 Print version information
179
180 =back
181
182 =head1 CAVEATS AND BUGS
183
184 Please submit bug reports to the issue tracker in the distribution repository.
185
186 =head1 AUTHOR
187
188 Jeremy Volkening (jdv@base2bio.com)
189
190 =head1 LICENSE AND COPYRIGHT
191
192 Copyright 2014-16 Jeremy Volkening
193
194 This program is free software: you can redistribute it and/or modify
195 it under the terms of the GNU General Public License as published by
196 the Free Software Foundation, either version 3 of the License, or
197 (at your option) any later version.
198
199 This program is distributed in the hope that it will be useful,
200 but WITHOUT ANY WARRANTY; without even the implied warranty of
201 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
202 GNU General Public License for more details.
203
204 You should have received a copy of the GNU General Public License
205 along with this program. If not, see <http://www.gnu.org/licenses/>.
206
207 =cut
208