annotate fasta-stats.pl @ 0:be48db09665c draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
author iuc
date Thu, 22 Nov 2018 04:16:11 -0500
parents
children 53c14c29c2fd
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
1 #!/usr/bin/env perl
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
2
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
3 # fasta-stats
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
4 # written by torsten.seemann@monash.edu
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
5 # oct 2012
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
6
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
7 use strict;
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
8 use warnings;
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
9 use List::Util qw(sum min max);
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
10
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
# stat storage

my $n   = 0;     # number of FASTA header lines seen so far
my $seq = '';    # residues accumulated for the record currently being read
my %stat;        # statistic name => value, printed at the end of the script
my @len;         # one entry per processed record: its length

# MAIN LOOP collecting sequences
#
# Lines are read from the files named on the command line (or STDIN).
# A line whose first non-blank character is '>' starts a new record; every
# other line is appended to the current record. Text that appears before
# the first header is accumulated but then discarded when that header
# resets $seq, because process() is only called once $n is non-zero.

while (defined(my $line = <ARGV>)) {
    chomp $line;

    # Not a header: this is sequence data for the current record.
    if ($line !~ m/^\s*>/) {
        $seq .= $line;
        next;
    }

    # Header: flush the previous record (if there was one) and start afresh.
    process($seq) if $n;
    $n++;
    $seq = '';
}

# Flush the final record; nothing to do if no header was ever seen.
process($seq) if $n;
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
33
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
# sort length array
# (should use hash here for efficiency with huge no of short reads?)

@len = sort { $a <=> $b } @len;

# compute more stats
#
# num_seq is always reported; the remaining length statistics are only
# defined when at least one record was read, so they are skipped otherwise.

$stat{'num_seq'} = scalar(@len);

if (@len) {
    $stat{'num_bp'}  = sum(@len);
    $stat{'len_min'} = $len[0];
    $stat{'len_max'} = $len[-1];

    # Element at index int(count/2) of the ascending list; for an even
    # number of records this is the upper of the two middle values.
    $stat{'len_median'} = $len[ int(@len / 2) ];

    # Mean is truncated to an integer.
    $stat{'len_mean'} = int( $stat{'num_bp'} / $stat{'num_seq'} );

    # calculate n50
    #
    # N50 is the length of the shortest record in the smallest set of
    # longest records that together hold at least half of all bases, so
    # accumulate from the longest record downwards. Comparing 2*cum with
    # the total avoids truncating the half-way threshold for odd totals.
    # (Example: lengths 2..10 sum to 54; 10+9+8 = 27, so N50 is 8.)
    $stat{'len_N50'} = 0;
    my $cum = 0;
    for my $length (reverse @len) {
        $cum += $length;
        if (2 * $cum >= $stat{'num_bp'}) {
            $stat{'len_N50'} = $length;
            last;
        }
    }
}
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
62
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
#calculate GC content
#
# The per-base counters only exist once process() has run, so default each
# to 0: an input without any record would otherwise raise "uninitialized"
# warnings here. num_bp_not_N is the number of A/C/G/T characters, i.e. any
# other symbol (not just N) is excluded from the denominator.
# GC_content is a percentage; it is reported as 0 when there are no A/C/G/T
# bases at all, rather than dying with a division by zero.
{
    my ($num_g, $num_c, $num_a, $num_t)
        = map { $stat{"num_$_"} || 0 } qw(G C A T);

    $stat{'num_bp_not_N'} = $num_g + $num_c + $num_a + $num_t;
    $stat{'GC_content'}   = $stat{'num_bp_not_N'}
        ? ($num_g + $num_c) / $stat{'num_bp_not_N'} * 100
        : 0;
}
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
66
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
# print stats as .tsv
#
# One "name<TAB>value" line per statistic, in sorted key order. Keys that
# match GC_content are printed with one decimal place; everything else is
# printed as-is.

for my $name (sort keys %stat) {
    my $format = ($name =~ m/GC_content/) ? "%s\t%0.1f\n" : "%s\t%s\n";
    printf($format, $name, $stat{$name});
}
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
76
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
# run for each sequence
#
# process($s)
#   $s - the concatenated sequence lines of one FASTA record.
#
# Adds the record's base composition to $stat{num_A}, $stat{num_G},
# $stat{num_T}, $stat{num_C} and $stat{num_N} (upper and lower case are
# counted together) and appends the record's length to @len.

sub process {
    my ($s) = @_;

    # Whitespace is not sequence data. The caller only chomps "\n", so a
    # CRLF file leaves one "\r" per line (and lines may carry blanks);
    # remove them so they are not counted in the record length.
    $s =~ s/\s+//g;

    # base composition
    # tr/// with an empty replacement list counts matching characters
    # without modifying the string.
    $stat{'num_A'} += ($s =~ tr/Aa//);
    $stat{'num_G'} += ($s =~ tr/Gg//);
    $stat{'num_T'} += ($s =~ tr/Tt//);
    $stat{'num_C'} += ($s =~ tr/Cc//);
    $stat{'num_N'} += ($s =~ tr/Nn//);

    # keep list of all lengths encountered
    push @len, length($s);
}
be48db09665c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit d6a78405947a91659a4168ddb2f1534327f044cb
iuc
parents:
diff changeset
89