diff summarize_assembly @ 1:2367d00c5182 draft default tip

"planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/b2b_utils commit 9bf8a0462bd44f170c0371b6cae67dd0c3b3da9f-dirty"
author jdv
date Tue, 28 Sep 2021 06:12:40 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/summarize_assembly	Tue Sep 28 06:12:40 2021 +0000
@@ -0,0 +1,208 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use 5.012;
+
+use Getopt::Long;
+use Pod::Usage;
+use List::Util qw/sum max min/;
+use List::MoreUtils qw/uniq/;
+use BioX::Seq::Stream;
+
+my $PROGRAM = 'summarize_assembly';
+my $VERSION = 0.002;
+
+my $fn_fasta;
+my @cutoffs;
+my $strip = 0;
+my $split = 0;
+
+# Collect command-line parameters
+my $err_msg = 'Syntax error: please check your syntax';
+GetOptions(
+    'fasta=s'      => \$fn_fasta,
+    'cutoffs:i{,}' => \@cutoffs,
+    'strip_N'      => \$strip,
+    'split_N'      => \$split,
+    'help'         => sub{ pod2usage( -verbose => 2 ) },
+    'version'      => sub{ print "This is $PROGRAM v$VERSION\n";exit; },
+) or pod2usage( -msg => $err_msg, -verbose => 1 );
+
+# Set default cutoffs if necessary and sort
+if (! scalar @cutoffs) {
+    warn "No cutoff supplied, defaulting to N50\n";
+    push @cutoffs, 50;
+}
+@cutoffs = sort {$a <=> $b} uniq @cutoffs;
+
+# Only one of 'strip_N' or 'split_N' is valid
+if ($strip && $split) {
+    warn "Only one of --strip_N or --split_N is valid, ignoring --strip_N\n";
+    $strip = 0;
+}
+
+# Check for a few necessary conditions
+die "Can't open FASTA file for reading"
+    if (defined $fn_fasta && ! -r $fn_fasta);
+die "One or more cutoffs outside valid range (1-99)"
+    if (grep {$_ < 1 || $_ > 99} @cutoffs);
+die "Cutoffs must be integer values"
+    if (grep {$_ ne int($_)} @cutoffs);
+my @lens;
+my $N_sum  = 0;
+my $GC_sum = 0;
+
+# Read in sequences and calculate descriptive stats
+my $stream = BioX::Seq::Stream->new( $fn_fasta ); #STDIN if undefined
+
+SEQ:
+while (my $seq = $stream->next_seq) {
+    my @parts = ($seq);
+    @parts = split(/n+/i, $seq) if $split;
+    for (@parts) {
+        my $Ns   = ($_ =~ tr/Nn//);
+        $N_sum  += $Ns;
+        $GC_sum += ($_ =~ tr/GCgc//);
+        push @lens, length($_) - $Ns * $strip;
+    }
+}
+@lens = sort {$b <=> $a} @lens;
+
+# Calculate basic stats
+my $scaffold_count = scalar @lens;
+my $total_len   = sum @lens;
+my $N_fraction  = round( $N_sum/($total_len + $N_sum*$strip), 2 )*100;
+my $max_length  = max @lens;
+my $min_length  = min @lens;
+my $mean_length = round($total_len/$scaffold_count, 0);
+
+# GC percentage calculated from non-ambiguous bases only
+my $GC_fraction = round( $GC_sum/($total_len + ($strip - 1)*$N_sum), 2 )*100;
+
+# Calculate Nx (N50, N80, etc) values
+# For example, N50 is the size of the smallest contig for which it and all
+# larger contigs contain 50% of the total nucleotides in the database
+my $cum_length = 0;
+my @fractions  = map {$_/100} @cutoffs;
+my @Nx;
+
+LEN:
+for (@lens) {
+    $cum_length += $_;
+    if ($cum_length/$total_len >= $fractions[0]) {
+        push @Nx, $_;
+        shift @fractions;
+        last LEN if (@fractions < 1);
+    }
+}
+
+# Print summary
+print '-' x 40 . "\n"
+    . "Summary\n"
+    . '-' x 40 . "\n"
+    . "number of scaffolds: $scaffold_count\n"
+    . "total length:        $total_len\n"
+    . "average length:      $mean_length\n"
+    . "G/C content:         $GC_fraction\%\n"
+    . "ambiguous content:   $N_fraction\%\n"
+    . "longest scaffold:    $max_length\n";
+for (0..$#Nx) {
+    my $label = sprintf "N%02d", $cutoffs[$_];
+    print "$label:                 $Nx[$_]\n";
+}
+print "shortest scaffold:   $min_length\n";
+print "NOTE: Ns were stripped for above calculations\n" if ($strip);
+print "NOTE: Scaffolds were split on Ns for above calculations\n" if ($split);
+print '-' x 40 . "\n";
+
+exit;
+
+sub round {
+
+    my ($val,$places) = @_;
+    return int($val*10**$places+0.5)/10**$places;
+
+}
+    
+
+__END__
+
+=head1 NAME
+
+summarize_assembly - print basic summary info for a file of assembly scaffolds
+
+=head1 SYNOPSIS
+
+summarize_assembly [--cutoffs I<cutoff_1> I<cutoff_2> .. I<cutoff_N> --strip_N --split_N ] --fasta I<input_file>]
+
+=head1 DESCRIPTION
+
+This program takes a FASTA file and optionally a list of cutoff values as
+input and prints out summary information about the contigs/scaffolds contained
+in the file. You can, of course, supply a FASTA file of any sort of nucleic
+acid sequences, but the summary information makes most sense for contigs from
+genomic sequencing assemblies.
+
+=head1 OPTIONS
+
+=over
+
+=item B<--fasta> I<filename>
+
+Specify contig/scaffold file from which to read input (default: STDIN)
+
+=item B<--cutoffs>
+
+Space-separated integer list of cutoffs to calculate (e.g. '--cutoffs 50 90'
+will output N50 and N90 values) (default: 50)
+
+=item B<--strip_N>
+
+If specified, Ns will be stripped from scaffold sequences before statistics
+are calculated (default: FALSE)
+
+=item B<--split_N>
+
+If specified, scaffold sequences will be split at regions of one or more Ns
+before statistics are calculated (e.g. to get contig-level stats from a
+scaffold file). Note that if this flag is specified, the value of '--strip_N'
+will be ignored. (default: FALSE)
+
+=item B<--help>
+
+Display this usage page
+
+=item B<--version>
+
+Print version information
+
+=back
+
+=head1 CAVEATS AND BUGS
+
+Please submit bug reports to the issue tracker in the distribution repository.
+
+=head1 AUTHOR
+
+Jeremy Volkening (jdv@base2bio.com)
+
+=head1 LICENSE AND COPYRIGHT
+
+Copyright 2014-16 Jeremy Volkening
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+=cut
+