Mercurial > repos > davidvanzessen > mutation_analysis
view logo.pm @ 3:a0b27058dcac draft
Uploaded
author | davidvanzessen |
---|---|
date | Wed, 17 Sep 2014 07:25:17 -0400 |
parents | 2f4298673519 |
children |
line wrap: on
line source
#!/usr/local/bin/perl -w =head1 NAME logo.pm - organizes data in FASTA and CLUSTAL formats into height data. =head1 SYNOPSIS Perl module =head1 DESCRIPTION logo.pm: Takes in strings of aligned sequences and sorts them vertically based on height as assigned by the following equations found in Schneider and Stephens paper "Sequence Logos: A New Way to Display Consensus Sequences": height = f(b,l) * R(l) (1) where f(b,l) is the frequency of base or amino acid "b" at position "l". R(l) is amount of information present at position "l" and can be quantified as follows: R(l) for amino acids = log(20) - (H(l) + e(n)) (2a) R(l) for nucleic acids = 2 - (H(l) + e(n)) (2b) where log is taken base 2, H(l) is the uncertainty at position "l", and e(n) is the error correction factor for small "n". H(l) is computed as follows: H(l) = - (Sum f(b,l) * log[ f(b,l) ]) (3) where again, log is taken base 2. f(b,l) is the frequency of base "b" at position "l". The sum is taken over all amino acids or bases, depending on which the data is. Currently, logo.pm uses an approximation for e(n), given by: e(n) = (s-1) / (2 * ln2 * n) (4) Where s is 4 for nucleotides, 20 for amino acids ; n is the number of sequences in the alignment. e(n) also gives the height of error bars. =cut package logo; use strict; use Carp; ################################################################################ ###### SOME VARIABLES ###### ################################################################################ my $DEBUG = 0; my $AA = 0; my $NA = 1; my %BASES = ("a" => "adenine", "t" => "thymine", "g" => "guanine", "c" => "cytosine", "u" => "uracil"); # does not include B or Z my %AMINOACIDS = ("a" => "", "c" => "", "d" => "", "e" => "", "f" => "", "g" => "", "h" => "", "i" => "", "k" => "", "l" => "", "m" => "", "n" => "", "p" => "", "q" => "", "r" => "", "s" => "", "t" => "", "v" => "", "w" => "", "y" => ""); my @data; my $kind; my ($seqs_r, $desc_r); my $CONFIDENCE_LIMIT = 0.90; ################################################################################ ###### SOME FUNCTIONS ###### ################################################################################ =head1 APPENDIX =cut =head2 getHeightData() Usage : my ($height_data_r, $description_data_r, $kind) = logo::getHeightData($input_data_r, $params); Returns : * REFERENCE TO array of height data * REFERENCE TO array of input description strings * $AA if the data is amino acid, $NA otherise Args : $input_data_r : input data in CLUSTAL or FASTA formats : $params : hash of parameters getHeightData is the entry point into the logo.pm module. $input_data_r is a reference to an array of strings containing FASTA or CLUSTAL data, where all lines whose first character is "#" is considered a comment line. $params is a hash of parameters with the following keys: * smallsampletoggle : 0 to turn off small sample correction, otherwise small sample correction is on * input_kind : 0 for amino acids, 1 for nucleic acids; if undefined, logo.pm will attempt to automatically detect whether the input consists of amino or nucleic acid data. If $input_kind is defined, only those residues defined by $input_kind will be in the output -- all other residues are considered as spaces. For example, if $input_kind is $NA, the residue "I" or "i" are considered spaces, since "I" and "i" are not nucleic acid residues. * stretch : stretch all characters so they are flush at the maximum number of bits allowed Sample use: # get FASTA data open (FASTA, "$fastafile"); my @inputdata = <FASTA>; close (FASTA); my %heightparams = ( smallsamplecorrection => 0, input_kind => 0, stretch => 0 ); # get height data my ($heightdata_r, $desc_r, $kind) = logo::getHeightData(\@inputdata, \%heightparams); =cut # entry point into module sub getHeightData { # $smallsampletoggle is toggle to turn off small sample correction (1 to turn off) # $input_kind can be $AA or $NA or undef my ($input_data_r, $params) = @_; # gary 040119: adjust for formats (Unix is \n, Mac is \r, Windows is \r\n) $input_data_r = normalizeData($input_data_r); # gets sequences, sets $kind temporarily my ($goodlength, $maxreslength, $badline, $validformat); ($seqs_r, $desc_r, $maxreslength, $goodlength, $badline, $validformat) = getSeqs($input_data_r, $params->{input_kind}); # for(my $i = 0; $i < scalar @$seqs_r ; $i++) { # print STDERR ($desc_r->[$i] . "\n" . $seqs_r->[$i] . "\n"); # } # print STDERR "maxreslength = $maxreslength\n"; # # exit(0); if ($DEBUG) { print STDERR ("point 1\n");} # check for valid format if ((defined $validformat) && ($validformat == 1)) { # print("returning\n"); return (undef, undef, undef, undef, undef, 1); } if ($DEBUG) { print STDERR ("point 2\n");} # check for bad length if (!$goodlength) { return (undef, undef, undef, $goodlength, $badline); } # reset $kind if in $input_kind if (defined $params->{input_kind} && isLegalKind($params->{input_kind}) ) { $kind = $params->{input_kind}; } # build data buildData(@$seqs_r, $params->{smallsampletoggle}, $params->{stretch}, $maxreslength); if ($DEBUG) { print STDERR ("point 3\n");} # print STDERR ("data size = ", scalar @data, "\n"); # foreach (@data) { # print STDERR ("$_\n"); # } # # exit(0); # # print STDERR ("return at 2\n"); return (\@data, $desc_r, $kind, $goodlength, $badline); } sub isLegalKind { return ($_[0] =~ /^[01]$/); } ################################################################################ # # sub normalizeData($data_r) returns $data_r, with Mac/Unix/Windows newline # style normalized to standard Unix-style newline style # ################################################################################ sub normalizeData { my ($data_r) = @_; # check args if (not defined $data_r) { die "data_r must be defined\n"; } my @normalized = (); foreach my $pseudo_line (@$data_r) { my @split_line = split(/[\r\n]+/, $pseudo_line); push(@normalized, @split_line); } return \@normalized; } ################################################################################ # # sub getSeqs($data_r, $kind) returns 5 values: # # * array reference to sequence strings # * array reference to sequence names # * length of sequence # * 1 if all sequences have the same length, 0 else # * line number L where sequenceLength(L) != sequenceLength(other lines), else # undef # ################################################################################ sub getSeqs { my ($input_data_r, $kind) = @_; unless( $input_data_r->[0] ){ return (undef, undef, undef, undef, undef, 1); } # skip all comment chars and lines of all spaces while ( ($input_data_r->[0] =~ /^\s*\#/) || ($input_data_r->[0] =~ /^\s*$/) ) { shift @$input_data_r; if( !defined $input_data_r->[0]) {return (undef, undef, undef, undef, undef, 1);} } if (isFormat_FASTA($input_data_r)) { return getSeqs_FASTA($input_data_r, $kind); } elsif (isFormat_CLUSTAL($input_data_r)) { return getSeqs_CLUSTAL($input_data_r, $kind); } elsif (isFormat_FLAT($input_data_r)) { return getSeqs_FLAT($input_data_r, $kind); } else { if ($DEBUG) {print STDERR ("format nothing\n");} return (undef, undef, undef, undef, undef, 1); } # if ($_[0] =~ />/) { # return getSeqs_FASTA(@_); # } else { # return getSeqs_CLUSTAL(@_); # } } ################################################################################ # # sub isFormat_FASTA($data_r) returns 1 if $data_r is in FASTA format # ################################################################################s sub isFormat_FASTA { my ($input_data_r) = @_; # check args if (not defined $input_data_r) { Carp::confess("logo::isFormat_FASTA : input_data_r must be defined\n"); } if ($input_data_r->[0] =~ />/) { return 1; } else { return 0; } } ################################################################################ # # sub isFormat_CLUSTAL($data_r) returns 1 if $data_r is in CLUSTAL format # ################################################################################ sub isFormat_CLUSTAL { my ($input_data_r) = @_; # check args if (not defined $input_data_r) { Carp::confess("logo::isFormat_CLUSTAL : input_data_r must be " . "defined\n"); } my $i=0; # # skip spaces or just "*" and "." and ":" # while ($input_data_r->[$i] =~ /^[\*\:\s]*$/) { # $i++; # } # if it looks like CLUSTAL W (version) ... , then it must be clustal if ($input_data_r->[$i] =~ /^\s*CLUSTAL/) { return 1; } # CLUSTAL looks like: "name seq" if ($input_data_r->[$i] =~ /^\s*(\S+)\s+(\S+)\s*$/) { return 1; } else { return 0; } } ################################################################################ # # sub isFormat_FLAT($data_r) returns 1 if $data_r is in FLAT format # ################################################################################ sub isFormat_FLAT { my ($input_data_r) = @_; # check args if (not defined $input_data_r) { Carp::confess("logo::isFormat_FLAT : input_data_r must be defined\n"); } # print("checking flat\n"); # print("first element = -->", $input_data_r->[0], "<--\n"); if ($input_data_r->[0] =~ /^[a-zA-Z\-]+\s*$/) { return 1; } else { return 0; } } ################################################################################ ###### FORMATTING FUNCTIONS ###### ################################################################################ # the flat sequence format is as follows: # sequence1 # sequence2 # sequence3 # ... # sequenceN sub getSeqs_FLAT { if ($DEBUG) {print STDERR "DOING FLAT\n";} my ($input_data_r, $input_kind) = @_; my $linelength = 0; my $seqCount = 0; my $total_residues = 0; my (@returnVal, @desc) = (); my $prevlinelength = undef; my $NA_count = 0; foreach my $seq (@$input_data_r) { # chomp $seq; $seq =~ s/\s+$//; my @chars = split(//,$seq); my $char; foreach (@chars) { $total_residues++; $linelength++; # set $char if (defined $input_kind) { if ($input_kind == $AA) { $char = (isAA($_)) ? $_ : "-"; } else { # == $NA $char = (isNA($_)) ? $_ : "-"; } } else { $char = $_; if (isNA($char)) { $NA_count++; } } $returnVal[$seqCount] .= $char; } $desc[$seqCount] = "no name"; if ($seqCount == 0) { $prevlinelength = $linelength; } elsif ($prevlinelength != $linelength) { # different number of residues, so complain return (undef, undef, undef, 0, $seq); # 0 for not same length, $seq is name } $linelength=0; $seqCount++; } # determine whether to use $NA or $AA if (!defined $input_kind) { if ($NA_count / ($total_residues+1) >= $CONFIDENCE_LIMIT) { $kind = $NA; } else { $kind = $AA; } } return (\@returnVal, \@desc, $prevlinelength, 1, undef); } sub getSeqs_CLUSTAL { if ($DEBUG) {print STDERR "DOING CLUSTAL\n";} my ($input_data_r, $input_kind) = @_; my @returnVal; my @desc; my $seqCount=0; my $reslength = 0; my ($name, $seq); # my $input_kind = pop @_; # my $CONFIDENCE_LIMIT = 0.90; my $NA_count = 0; my $total_residues = 0; my ($prevlinelength, $linelength) = (0,0); # foreach (@_) { foreach (@$input_data_r) { # chomp; if ($DEBUG) {print STDERR ("line = $_\n")}; $_ =~ s/\s+$//; # skip if it is a comment character -- first character is "#" next if (/^\s*\#/); # skil if it is a CLUSTAL W header line next if (/^\s*CLUSTAL/); # if spaces or just "*" and "." and ":" if (/^[\*\.\:\s]*$/) { $seqCount=0; $prevlinelength=0; next; } ($name,$seq) = (/^\s*(\S+)\s+(\S+)\s*$/); if ($DEBUG) { print STDERR ("name, seq = $name, $seq\n"); } # add new entry if (!defined $desc[$seqCount]) { $desc[$seqCount] = $name; $returnVal[$seqCount] = ""; } if(!defined $seq) {return (undef, undef, undef, undef, undef, 1);} # Something has gone terrible wrong my @chars = split(//,$seq); my $char; foreach (@chars) { if ($seqCount == 0) { $reslength++; # all sequences have same residue length, so only count first one } $total_residues++; $linelength++; # set $char if (defined $input_kind) { if ($input_kind == $AA) { $char = (isAA($_)) ? $_ : "-"; } else { # == $NA $char = (isNA($_)) ? $_ : "-"; } } else { $char = $_; if (isNA($char)) { $NA_count++; } } $returnVal[$seqCount] .= $char; } if ($seqCount == 0) { $prevlinelength = $linelength; } elsif ($prevlinelength != $linelength) { # different number of residues, so complain return (undef, undef, undef, 0, $name); } $linelength=0; $seqCount++; } # determine whether to use $NA or $AA if (!defined $input_kind ) { if ($NA_count / ($total_residues+1) >= $CONFIDENCE_LIMIT) { $kind = $NA; } else { $kind = $AA; } } return (\@returnVal, \@desc, $reslength, 1, undef); } # if $input_kind is defined, residues that are not defined are set to space sub getSeqs_FASTA { if ($DEBUG) {print STDERR "DOING FASTA\n";} my ($input_data_r, $input_kind) = @_; my @returnVal; my @desc; my $count=-1; my $newElem=0; # my $input_kind = pop @_; # my $CONFIDENCE_LIMIT = 0.90; my $NA_count = 0; my $total_residues = 0; my $reslength = 0; my $maxreslength = 0; my ($goodlength, $currline, $prevline); # # skip all lines that are all spaces # while ($_[0] =~ /^\s*$/) { # shift @_; # } # foreach (@_) { foreach (@$input_data_r) { $_ =~ s/\s+$//; # skip if it is a comment character -- first character is "#" next if (/^\s*\#/); # skip all lines that are all spaces next if (/^\s*$/); $_ =~ s/\s+$//; # cut trailing white space $_ =~ s/^\s+//; # cut leading white space if (/>/) { $currline = $_; ($desc[scalar @desc]) = ($_ =~ />\s*(.+)$/); if (not $newElem) { $count++; $newElem = 1; } } else { if ($newElem) { $maxreslength = $reslength if $maxreslength == 0; if (($maxreslength != 0) && ($maxreslength != $reslength)) { return (undef, undef, undef, 0, $prevline); } $maxreslength = $reslength; $reslength = 0; } my @chars = split(//,$_); my $char; foreach (@chars) { $reslength++; $total_residues++; # set $char if (defined $input_kind) { if ($input_kind == $AA) { $char = (isAA($_)) ? $_ : "-"; } else { # == $NA $char = (isNA($_)) ? $_ : "-"; } } else { $char = ($_ =~ /[a-zA-Z]/) ? $_ : "-"; # if not alpha char, use space if (isNA($char) && !isSpace($char)) { $NA_count++; } } if ($newElem) { $returnVal[$count] = $char; } else { $returnVal[$count] .= $char; } $newElem = 0; } $prevline = $currline if $currline =~ />/; } } # check if last is biggest if (($maxreslength != 0) && ($maxreslength != $reslength)) { return (undef, undef, undef, 0, $prevline); } # $maxreslength = ($reslength > $maxreslength) ? $reslength : $maxreslength; # determine whether to use $NA or $AA if (!defined $input_kind) { if ($NA_count / ($total_residues+1) >= $CONFIDENCE_LIMIT) { $kind = $NA; } else { $kind = $AA; } } return (\@returnVal, \@desc, $maxreslength || $reslength, 1, undef); # 1 for good lengths } sub isSpace { return $_[0] =~ /[ \-]/; } sub isAA { return (defined $AMINOACIDS{lc $_[0]}); } sub isNA { return (defined $BASES{lc $_[0]}); } ################################################################################ ###### DATA BUILDING FUNCTIONS ###### ################################################################################ # arguments: takes reference to array and lines aligned sequences of bases or # amino acids # returns: updated reference array to reflect contents of sequences sorted # vertically by height as described by (1) sub buildData { my $currentx = 0; my $h; my $count=0; my $maxreslength = pop (@_); my $stretch = pop(@_); my $smallsampletoggle = pop (@_); my $totalsize = $#_+1; while ($currentx < $maxreslength) { #(length $_[0])) { my $allspaces = 1; my $spaceCount=0; # get vertical sequence my @vert=(); foreach (@_) { # foreach sequence my $currentchar; # set currentchar, set to " " if $_ is not long enough if ($currentx >= (length $_)) { $currentchar = " "; } else { $currentchar = substr($_,$currentx,1); } # in all sequences, "-" is considered as a space # don't count " " and "-" if (($currentchar ne "-") && ($currentchar ne " ")) { $vert[scalar @vert] = uc substr($_,$currentx,1); $allspaces = 0; } else { $spaceCount++; } } if ($allspaces) { # build @vert @vert = (" 0", ">0"); # place in @data $data[scalar @data] = \@vert; $currentx++; next; } my $error; if ($smallsampletoggle) { $error=getError($kind,$totalsize - $spaceCount); } else { $error = 0; } # sort vertical sequence by amino acid or base @vert = sort(@vert); my $total = $#vert + 1; # find H(l) -- must be done before collapsing $h = getH(@vert); # collect like terms @vert = collapse(@vert); # get R(l) my $r; if (!defined $stretch || !$stretch) { $r= getR($kind, $h, $error); } else { $r = ($kind == $NA) ? 2 : (log(20) / log(2)); } # place heights my $count=0; my $height; my $elem; foreach $elem (@vert) { $height = getHeight(substr($elem, 2) / $total,$r); $vert[$count] = substr($elem,0,1) . $height; $count++; } # sort by height @vert = sort height_sort @vert; # put in error bar size $vert[$count] = ">$error"; # place in @data $data[scalar @data] = \@vert; $currentx++; } } # uses error approximation given by: # e := (s-1) / (2 * ln2 * ntrue); sub getError { return ((($_[0] == $NA) ? 4 : 20) - 1) / (2 * log(2) * $_[1]); } sub height_sort { my ($lettera, $heighta) = ($a =~ /^(.{1})(\S+)$/); #substr($a,1); my ($letterb, $heightb) = ($b =~ /^(.{1})(\S+)$/); #substr($b,1); # compare by height first, then letter if ($heighta <=> $heightb) { return $heighta <=> $heightb; } else { return $letterb cmp $lettera; #reversed for some reason... } } sub collapse { my @returnVal; my $current = $_[0]; my $count=0; my $freq; foreach (@_) { if ($current eq $_) { $count++; } else { $returnVal[scalar @returnVal] = "$current $count"; $current = $_; $count=1; } } # add last element $returnVal[scalar @returnVal] = "$current $count"; return @returnVal; } # arguments : $_[0] : list of bases or amino acids sub getH { my $h = 0; my (@vert) = @_; # vertical sequence (comparing multiple sequences) my $current = $vert[0]; my $count=0; my $freq; foreach (@vert) { if ($current eq $_) { $count++; } else { $freq = $count / ($#vert + 1); $h += $freq * (log ($freq) / log(2)); $current = $_; $count=1; } } # add last element $freq = $count / ($#vert + 1); $h += $freq * (log ($freq) / log(2)); return -$h; } # arguments : $_[0] : $AA or $NA # $_[1] : uncertainty = H(l) # $_[2] : error correction factor for small number of sequences # = e(n) ; assummed to be 0 if not given sub getR { my $max = ($_[0] == $NA) ? 2 : (log(20) / log(2)); my $e = (defined $_[2]) ? $_[2] : 0; return ($max - ($_[1] + $e)); } # arguments: $_[0] : frequency = f(b,l) # $_[1] : R(l) sub getHeight { return $_[0] * $_[1] } 1;