Mercurial > repos > jdv > b2b_bam2consensus
diff summarize_assembly @ 1:2367d00c5182 draft default tip
"planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9bf8a0462bd44f170c0371b6cae67dd0c3b3da9f-dirty"
author | jdv |
---|---|
date | Tue, 28 Sep 2021 06:12:40 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/summarize_assembly Tue Sep 28 06:12:40 2021 +0000 @@ -0,0 +1,208 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use 5.012; + +use Getopt::Long; +use Pod::Usage; +use List::Util qw/sum max min/; +use List::MoreUtils qw/uniq/; +use BioX::Seq::Stream; + +my $PROGRAM = 'summarize_assembly'; +my $VERSION = 0.002; + +my $fn_fasta; +my @cutoffs; +my $strip = 0; +my $split = 0; + +# Collect command-line parameters +my $err_msg = 'Syntax error: please check your syntax'; +GetOptions( + 'fasta=s' => \$fn_fasta, + 'cutoffs:i{,}' => \@cutoffs, + 'strip_N' => \$strip, + 'split_N' => \$split, + 'help' => sub{ pod2usage( -verbose => 2 ) }, + 'version' => sub{ print "This is $PROGRAM v$VERSION\n";exit; }, +) or pod2usage( -msg => $err_msg, -verbose => 1 ); + +# Set default cutoffs if necessary and sort +if (! scalar @cutoffs) { + warn "No cutoff supplied, defaulting to N50\n"; + push @cutoffs, 50; +} +@cutoffs = sort {$a <=> $b} uniq @cutoffs; + +# Only one of 'strip_N' or 'split_N' is valid +if ($strip && $split) { + warn "Only one of --strip_N or --split_N is valid, ignoring --strip_N\n"; + $strip = 0; +} + +# Check for a few necessary conditions +die "Can't open FASTA file for reading" + if (defined $fn_fasta && ! -r $fn_fasta); +die "One or more cutoffs outside valid range (1-99)" + if (grep {$_ < 1 || $_ > 99} @cutoffs); +die "Cutoffs must be integer values" + if (grep {$_ ne int($_)} @cutoffs); +my @lens; +my $N_sum = 0; +my $GC_sum = 0; + +# Read in sequences and calculate descriptive stats +my $stream = BioX::Seq::Stream->new( $fn_fasta ); #STDIN if undefined + +SEQ: +while (my $seq = $stream->next_seq) { + my @parts = ($seq); + @parts = split(/n+/i, $seq) if $split; + for (@parts) { + my $Ns = ($_ =~ tr/Nn//); + $N_sum += $Ns; + $GC_sum += ($_ =~ tr/GCgc//); + push @lens, length($_) - $Ns * $strip; + } +} +@lens = sort {$b <=> $a} @lens; + +# Calculate basic stats +my $scaffold_count = scalar @lens; +my $total_len = sum @lens; +my $N_fraction = round( $N_sum/($total_len + $N_sum*$strip), 2 )*100; +my $max_length = max @lens; +my $min_length = min @lens; +my $mean_length = round($total_len/$scaffold_count, 0); + +# GC percentage calculated from non-ambiguous bases only +my $GC_fraction = round( $GC_sum/($total_len + ($strip - 1)*$N_sum), 2 )*100; + +# Calculate Nx (N50, N80, etc) values +# For example, N50 is the size of the smallest contig for which it and all +# larger contigs contain 50% of the total nucleotides in the database +my $cum_length = 0; +my @fractions = map {$_/100} @cutoffs; +my @Nx; + +LEN: +for (@lens) { + $cum_length += $_; + if ($cum_length/$total_len >= $fractions[0]) { + push @Nx, $_; + shift @fractions; + last LEN if (@fractions < 1); + } +} + +# Print summary +print '-' x 40 . "\n" + . "Summary\n" + . '-' x 40 . "\n" + . "number of scaffolds: $scaffold_count\n" + . "total length: $total_len\n" + . "average length: $mean_length\n" + . "G/C content: $GC_fraction\%\n" + . "ambiguous content: $N_fraction\%\n" + . "longest scaffold: $max_length\n"; +for (0..$#Nx) { + my $label = sprintf "N%02d", $cutoffs[$_]; + print "$label: $Nx[$_]\n"; +} +print "shortest scaffold: $min_length\n"; +print "NOTE: Ns were stripped for above calculations\n" if ($strip); +print "NOTE: Scaffolds were split on Ns for above calculations\n" if ($split); +print '-' x 40 . "\n"; + +exit; + +sub round { + + my ($val,$places) = @_; + return int($val*10**$places+0.5)/10**$places; + +} + + +__END__ + +=head1 NAME + +summarize_assembly - print basic summary info for a file of assembly scaffolds + +=head1 SYNOPSIS + +summarize_assembly [--cutoffs I<cutoff_1> I<cutoff_2> .. I<cutoff_N> --strip_N --split_N ] --fasta I<input_file>] + +=head1 DESCRIPTION + +This program takes a FASTA file and optionally a list of cutoff values as +input and prints out summary information about the contigs/scaffolds contained +in the file. You can, of course, supply a FASTA file of any sort of nucleic +acid sequences, but the summary information makes most sense for contigs from +genomic sequencing assemblies. + +=head1 OPTIONS + +=over + +=item B<--fasta> I<filename> + +Specify contig/scaffold file from which to read input (default: STDIN) + +=item B<--cutoffs> + +Space-separated integer list of cutoffs to calculate (e.g. '--cutoffs 50 90' +will output N50 and N90 values) (default: 50) + +=item B<--strip_N> + +If specified, Ns will be stripped from scaffold sequences before statistics +are calculated (default: FALSE) + +=item B<--split_N> + +If specified, scaffold sequences will be split at regions of one or more Ns +before statistics are calculated (e.g. to get contig-level stats from a +scaffold file). Note that if this flag is specified, the value of '--strip_N' +will be ignored. (default: FALSE) + +=item B<--help> + +Display this usage page + +=item B<--version> + +Print version information + +=back + +=head1 CAVEATS AND BUGS + +Please submit bug reports to the issue tracker in the distribution repository. + +=head1 AUTHOR + +Jeremy Volkening (jdv@base2bio.com) + +=head1 LICENSE AND COPYRIGHT + +Copyright 2014-16 Jeremy Volkening + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. + +=cut +