Mercurial > repos > jdv > b2b_sync_reads
view summarize_assembly @ 1:b7f66945bf72 draft default tip
"planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9bf8a0462bd44f170c0371b6cae67dd0c3b3da9f-dirty"
author | jdv |
---|---|
date | Tue, 28 Sep 2021 06:14:59 +0000 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl use strict; use warnings; use 5.012; use Getopt::Long; use Pod::Usage; use List::Util qw/sum max min/; use List::MoreUtils qw/uniq/; use BioX::Seq::Stream; my $PROGRAM = 'summarize_assembly'; my $VERSION = 0.002; my $fn_fasta; my @cutoffs; my $strip = 0; my $split = 0; # Collect command-line parameters my $err_msg = 'Syntax error: please check your syntax'; GetOptions( 'fasta=s' => \$fn_fasta, 'cutoffs:i{,}' => \@cutoffs, 'strip_N' => \$strip, 'split_N' => \$split, 'help' => sub{ pod2usage( -verbose => 2 ) }, 'version' => sub{ print "This is $PROGRAM v$VERSION\n";exit; }, ) or pod2usage( -msg => $err_msg, -verbose => 1 ); # Set default cutoffs if necessary and sort if (! scalar @cutoffs) { warn "No cutoff supplied, defaulting to N50\n"; push @cutoffs, 50; } @cutoffs = sort {$a <=> $b} uniq @cutoffs; # Only one of 'strip_N' or 'split_N' is valid if ($strip && $split) { warn "Only one of --strip_N or --split_N is valid, ignoring --strip_N\n"; $strip = 0; } # Check for a few necessary conditions die "Can't open FASTA file for reading" if (defined $fn_fasta && ! -r $fn_fasta); die "One or more cutoffs outside valid range (1-99)" if (grep {$_ < 1 || $_ > 99} @cutoffs); die "Cutoffs must be integer values" if (grep {$_ ne int($_)} @cutoffs); my @lens; my $N_sum = 0; my $GC_sum = 0; # Read in sequences and calculate descriptive stats my $stream = BioX::Seq::Stream->new( $fn_fasta ); #STDIN if undefined SEQ: while (my $seq = $stream->next_seq) { my @parts = ($seq); @parts = split(/n+/i, $seq) if $split; for (@parts) { my $Ns = ($_ =~ tr/Nn//); $N_sum += $Ns; $GC_sum += ($_ =~ tr/GCgc//); push @lens, length($_) - $Ns * $strip; } } @lens = sort {$b <=> $a} @lens; # Calculate basic stats my $scaffold_count = scalar @lens; my $total_len = sum @lens; my $N_fraction = round( $N_sum/($total_len + $N_sum*$strip), 2 )*100; my $max_length = max @lens; my $min_length = min @lens; my $mean_length = round($total_len/$scaffold_count, 0); # GC percentage calculated from non-ambiguous bases only my $GC_fraction = round( $GC_sum/($total_len + ($strip - 1)*$N_sum), 2 )*100; # Calculate Nx (N50, N80, etc) values # For example, N50 is the size of the smallest contig for which it and all # larger contigs contain 50% of the total nucleotides in the database my $cum_length = 0; my @fractions = map {$_/100} @cutoffs; my @Nx; LEN: for (@lens) { $cum_length += $_; if ($cum_length/$total_len >= $fractions[0]) { push @Nx, $_; shift @fractions; last LEN if (@fractions < 1); } } # Print summary print '-' x 40 . "\n" . "Summary\n" . '-' x 40 . "\n" . "number of scaffolds: $scaffold_count\n" . "total length: $total_len\n" . "average length: $mean_length\n" . "G/C content: $GC_fraction\%\n" . "ambiguous content: $N_fraction\%\n" . "longest scaffold: $max_length\n"; for (0..$#Nx) { my $label = sprintf "N%02d", $cutoffs[$_]; print "$label: $Nx[$_]\n"; } print "shortest scaffold: $min_length\n"; print "NOTE: Ns were stripped for above calculations\n" if ($strip); print "NOTE: Scaffolds were split on Ns for above calculations\n" if ($split); print '-' x 40 . "\n"; exit; sub round { my ($val,$places) = @_; return int($val*10**$places+0.5)/10**$places; } __END__ =head1 NAME summarize_assembly - print basic summary info for a file of assembly scaffolds =head1 SYNOPSIS summarize_assembly [--cutoffs I<cutoff_1> I<cutoff_2> .. I<cutoff_N> --strip_N --split_N ] --fasta I<input_file>] =head1 DESCRIPTION This program takes a FASTA file and optionally a list of cutoff values as input and prints out summary information about the contigs/scaffolds contained in the file. You can, of course, supply a FASTA file of any sort of nucleic acid sequences, but the summary information makes most sense for contigs from genomic sequencing assemblies. =head1 OPTIONS =over =item B<--fasta> I<filename> Specify contig/scaffold file from which to read input (default: STDIN) =item B<--cutoffs> Space-separated integer list of cutoffs to calculate (e.g. '--cutoffs 50 90' will output N50 and N90 values) (default: 50) =item B<--strip_N> If specified, Ns will be stripped from scaffold sequences before statistics are calculated (default: FALSE) =item B<--split_N> If specified, scaffold sequences will be split at regions of one or more Ns before statistics are calculated (e.g. to get contig-level stats from a scaffold file). Note that if this flag is specified, the value of '--strip_N' will be ignored. (default: FALSE) =item B<--help> Display this usage page =item B<--version> Print version information =back =head1 CAVEATS AND BUGS Please submit bug reports to the issue tracker in the distribution repository. =head1 AUTHOR Jeremy Volkening (jdv@base2bio.com) =head1 LICENSE AND COPYRIGHT Copyright 2014-16 Jeremy Volkening This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. =cut