mutation_analysis: logo.pm comparison

comparison logo.pm @ 2:2f4298673519 draft

Uploaded

author	davidvanzessen
date	Wed, 10 Sep 2014 10:33:29 -0400
parents
children

comparison

equal deleted inserted replaced

-:856b5b718d21
+:2f4298673519
+#!/usr/local/bin/perl -w
+=head1 NAME
+logo.pm - organizes data in FASTA and CLUSTAL formats into height data.
+=head1 SYNOPSIS
+Perl module
+=head1 DESCRIPTION
+logo.pm: Takes in strings of aligned sequences and sorts them vertically
+based on height as assigned by the following equations found in
+Schneider and Stephens paper "Sequence Logos: A New Way to Display
+Consensus Sequences":
+height = f(b,l) * R(l)                            (1)
+where f(b,l) is the frequency of base or amino acid "b" at position
+"l". R(l) is amount of information present at position "l" and can
+be quantified as follows:
+R(l) for amino acids   = log(20) - (H(l) + e(n))    (2a)
+R(l) for nucleic acids =    2    - (H(l) + e(n))    (2b)
+where log is taken base 2, H(l) is the uncertainty at position "l",
+and e(n) is the error correction factor for small "n". H(l) is
+computed as follows:
+H(l) = - (Sum f(b,l) * log[ f(b,l) ])             (3)
+where again, log is taken base 2. f(b,l) is the frequency of base
+"b" at position "l". The sum is taken over all amino acids or
+bases, depending on which the data is.
+Currently, logo.pm uses an approximation for e(n), given by:
+e(n) = (s-1) / (2 * ln2 * n)                      (4)
+Where s is 4 for nucleotides, 20 for amino acids ; n is the number
+of sequences in the alignment. e(n) also  gives the height of error
+bars.
+=cut
+package logo;
+use strict;
+use Carp;
+################################################################################
+######                             SOME VARIABLES                         ######
+################################################################################
+my $DEBUG = 0;
+my $AA = 0;
+my $NA = 1;
+my %BASES = ("a" => "adenine",
+	     "t" => "thymine",
+	     "g" => "guanine",
+	     "c" => "cytosine",
+	     "u" => "uracil");
+# does not include B or Z
+my %AMINOACIDS = ("a" => "", "c" => "", "d" => "", "e" => "", "f" => "",
+		  "g" => "", "h" => "", "i" => "", "k" => "", "l" => "",
+		  "m" => "", "n" => "", "p" => "", "q" => "", "r" => "",
+		  "s" => "", "t" => "", "v" => "", "w" => "", "y" => "");
+my @data;
+my $kind;
+my ($seqs_r, $desc_r);
+my $CONFIDENCE_LIMIT = 0.90;
+################################################################################
+######                             SOME FUNCTIONS                         ######
+################################################################################
+=head1 APPENDIX
+=cut
+=head2 getHeightData()
+Usage   : my ($height_data_r, $description_data_r, $kind) =
+logo::getHeightData($input_data_r, $params);
+Returns : * REFERENCE TO array of height data
+* REFERENCE TO array of input description strings
+* $AA if the data is amino acid, $NA otherise
+Args    : $input_data_r : input data in CLUSTAL or FASTA formats
+: $params       : hash of parameters
+getHeightData is the entry point into the logo.pm module. $input_data_r is a
+reference to  an array of strings containing FASTA or CLUSTAL data, where all
+lines whose first character is "#" is considered a comment line.
+$params is a hash of parameters with the following keys:
+* smallsampletoggle : 0 to turn off small sample correction, otherwise
+small sample correction is on
+* input_kind : 0 for amino acids, 1 for nucleic acids; if undefined,
+logo.pm will attempt to automatically detect whether the
+input consists of amino or nucleic acid data. If
+$input_kind is defined, only those residues  defined by
+$input_kind will be in the output -- all other residues are
+considered as spaces. For example, if $input_kind is $NA,
+the residue "I" or "i" are considered spaces, since "I" and
+"i" are not nucleic acid residues.
+* stretch : stretch all characters so they are flush at the maximum number
+of bits allowed
+Sample use:
+# get FASTA data
+open (FASTA, "$fastafile");
+my @inputdata = <FASTA>;
+close (FASTA);
+my %heightparams = (
+		       smallsamplecorrection => 0,
+		       input_kind => 0,
+		       stretch => 0
+		       );
+# get height data
+my ($heightdata_r, $desc_r, $kind) = logo::getHeightData(\@inputdata, \%heightparams);
+=cut
+# entry point into module
+sub getHeightData {
+# $smallsampletoggle is toggle to turn off small sample correction (1 to turn off)
+# $input_kind can be $AA or $NA or undef
+my ($input_data_r, $params) = @_;
+# gary 040119: adjust for formats (Unix is \n, Mac is \r, Windows is \r\n)
+$input_data_r = normalizeData($input_data_r);
+# gets sequences, sets $kind temporarily
+my ($goodlength, $maxreslength, $badline, $validformat);
+($seqs_r, $desc_r, $maxreslength, $goodlength, $badline, $validformat) =
+	getSeqs($input_data_r, $params->{input_kind});
+#    for(my $i = 0; $i < scalar @$seqs_r ; $i++) {
+#	print STDERR ($desc_r->[$i] . "\n" . $seqs_r->[$i] . "\n");
+#    }
+#    print STDERR "maxreslength = $maxreslength\n";
+#
+#    exit(0);
+if ($DEBUG) { print STDERR ("point 1\n");}
+# check for valid format
+if ((defined $validformat) && ($validformat == 1)) {
+#	print("returning\n");
+	return (undef, undef, undef, undef, undef, 1);
+}
+if ($DEBUG) { print STDERR ("point 2\n");}
+# check for bad length
+if (!$goodlength) {
+	return (undef, undef, undef, $goodlength, $badline);
+}
+# reset $kind if in $input_kind
+if (defined $params->{input_kind} && isLegalKind($params->{input_kind}) ) {
+	$kind = $params->{input_kind};
+}
+# build data
+buildData(@$seqs_r, $params->{smallsampletoggle}, $params->{stretch}, $maxreslength);
+if ($DEBUG) { print STDERR ("point 3\n");}
+#    print STDERR ("data size = ", scalar @data, "\n");
+#    foreach (@data) {
+#	print STDERR ("$_\n");
+#    }
+#
+#    exit(0);
+#
+#    print STDERR ("return at 2\n");
+return (\@data, $desc_r, $kind, $goodlength, $badline);
+}
+sub isLegalKind {
+return ($_[0] =~ /^[01]$/);
+}
+################################################################################
+#
+# sub normalizeData($data_r) returns $data_r, with Mac/Unix/Windows newline
+# style normalized to standard Unix-style newline style
+#
+################################################################################
+sub normalizeData {
+my ($data_r) = @_;
+# check args
+if (not defined $data_r) {
+	die "data_r must be defined\n";
+}
+my @normalized = ();
+foreach my $pseudo_line (@$data_r) {
+	my @split_line = split(/[\r\n]+/, $pseudo_line);
+	push(@normalized, @split_line);
+}
+return \@normalized;
+}
+################################################################################
+#
+# sub getSeqs($data_r, $kind) returns 5 values:
+#
+# * array reference to sequence strings
+# * array reference to sequence names
+# * length of sequence
+# * 1 if all sequences have the same length, 0 else
+# * line number L where sequenceLength(L) != sequenceLength(other lines), else
+#   undef
+#
+################################################################################
+sub getSeqs {
+my ($input_data_r, $kind) = @_;
+unless( $input_data_r->[0] ){
+return (undef, undef, undef, undef, undef, 1);
+}
+# skip all comment chars and lines of all spaces
+while ( ($input_data_r->[0] =~ /^\s*\#/) || ($input_data_r->[0] =~ /^\s*$/) ) {
+	shift @$input_data_r;
+	if( !defined $input_data_r->[0])  {return (undef, undef, undef, undef, undef, 1);}
+}
+if (isFormat_FASTA($input_data_r)) {
+	return getSeqs_FASTA($input_data_r, $kind);
+} elsif (isFormat_CLUSTAL($input_data_r)) {
+	return getSeqs_CLUSTAL($input_data_r, $kind);
+} elsif (isFormat_FLAT($input_data_r)) {
+	return getSeqs_FLAT($input_data_r, $kind);
+} else {
+	if ($DEBUG) {print STDERR ("format nothing\n");}
+	return (undef, undef, undef, undef, undef, 1);
+}
+#    if ($_[0] =~ />/) {
+#	return getSeqs_FASTA(@_);
+#    } else {
+#	return getSeqs_CLUSTAL(@_);
+#    }
+}
+################################################################################
+#
+# sub isFormat_FASTA($data_r) returns 1 if $data_r is in FASTA format
+#
+################################################################################s
+sub isFormat_FASTA {
+my ($input_data_r) = @_;
+# check args
+if (not defined $input_data_r) {
+	Carp::confess("logo::isFormat_FASTA : input_data_r must be defined\n");
+}
+if ($input_data_r->[0] =~ />/) {
+	return 1;
+} else {
+	return 0;
+}
+}
+################################################################################
+#
+# sub isFormat_CLUSTAL($data_r) returns 1 if $data_r is in CLUSTAL format
+#
+################################################################################
+sub isFormat_CLUSTAL {
+my ($input_data_r) = @_;
+# check args
+if (not defined $input_data_r) {
+	Carp::confess("logo::isFormat_CLUSTAL : input_data_r must be " .
+		      "defined\n");
+}
+my $i=0;
+#    # skip spaces or just "*" and "." and ":"
+#    while ($input_data_r->[$i] =~ /^[\*\:\s]*$/) {
+#	$i++;
+#    }
+# if it looks like CLUSTAL W (version) ... , then it must be clustal
+if ($input_data_r->[$i] =~ /^\s*CLUSTAL/) {
+	return 1;
+}
+# CLUSTAL looks like: "name        seq"
+if ($input_data_r->[$i] =~ /^\s*(\S+)\s+(\S+)\s*$/) {
+	return 1;
+} else {
+	return 0;
+}
+}
+################################################################################
+#
+# sub isFormat_FLAT($data_r) returns 1 if $data_r is in FLAT format
+#
+################################################################################
+sub isFormat_FLAT {
+my ($input_data_r) = @_;
+# check args
+if (not defined $input_data_r) {
+	Carp::confess("logo::isFormat_FLAT : input_data_r must be defined\n");
+}
+#    print("checking flat\n");
+#    print("first element = -->", $input_data_r->[0], "<--\n");
+if ($input_data_r->[0] =~ /^[a-zA-Z\-]+\s*$/) {
+	return 1;
+} else {
+	return 0;
+}
+}
+################################################################################
+######                          FORMATTING FUNCTIONS                      ######
+################################################################################
+# the flat sequence format is as follows:
+# sequence1
+# sequence2
+# sequence3
+# ...
+# sequenceN
+sub getSeqs_FLAT {
+if ($DEBUG) {print STDERR "DOING FLAT\n";}
+my ($input_data_r, $input_kind) = @_;
+my $linelength = 0;
+my $seqCount = 0;
+my $total_residues = 0;
+my (@returnVal, @desc) = ();
+my $prevlinelength = undef;
+my $NA_count = 0;
+foreach my $seq (@$input_data_r) {
+#	chomp $seq;
+	$seq =~ s/\s+$//;
+	my @chars = split(//,$seq);
+	my $char;
+	foreach (@chars) {
+	    $total_residues++;
+	    $linelength++;
+	    # set $char
+	    if (defined $input_kind) {
+		if ($input_kind == $AA) {
+		    $char = (isAA($_)) ? $_ : "-";
+		} else { # == $NA
+		    $char = (isNA($_)) ? $_ : "-";
+		}
+	    } else {
+		$char = $_;
+		if (isNA($char)) {
+		    $NA_count++;
+		}
+	    }
+	    $returnVal[$seqCount] .= $char;
+	}
+	$desc[$seqCount] = "no name";
+	if ($seqCount == 0) {
+	    $prevlinelength = $linelength;
+	} elsif ($prevlinelength != $linelength) {  # different number of residues, so complain
+	    return (undef, undef, undef, 0, $seq);  # 0 for not same length, $seq is name
+	}
+	$linelength=0;
+	$seqCount++;
+}
+# determine whether to use $NA or $AA
+if (!defined $input_kind) {
+	if ($NA_count / ($total_residues+1) >= $CONFIDENCE_LIMIT) {
+	    $kind = $NA;
+	} else {
+	    $kind = $AA;
+	}
+}
+return (\@returnVal, \@desc, $prevlinelength, 1, undef);
+}
+sub getSeqs_CLUSTAL {
+if ($DEBUG) {print STDERR "DOING CLUSTAL\n";}
+my ($input_data_r, $input_kind) = @_;
+my @returnVal;
+my @desc;
+my $seqCount=0;
+my $reslength = 0;
+my ($name, $seq);
+#    my $input_kind = pop @_;
+#    my $CONFIDENCE_LIMIT = 0.90;
+my $NA_count = 0;
+my $total_residues = 0;
+my ($prevlinelength, $linelength) = (0,0);
+#    foreach (@_) {
+foreach (@$input_data_r) {
+#	chomp;
+	if ($DEBUG) {print STDERR ("line = $_\n")};
+	$_ =~ s/\s+$//;
+	# skip if it is a comment character -- first character is "#"
+	next if (/^\s*\#/);
+	# skil if it is a CLUSTAL W header line
+	next if (/^\s*CLUSTAL/);
+	# if spaces or just "*" and "." and ":"
+	if (/^[\*\.\:\s]*$/) {
+	    $seqCount=0;
+	    $prevlinelength=0;
+	    next;
+	}
+	($name,$seq) = (/^\s*(\S+)\s+(\S+)\s*$/);
+	if ($DEBUG) { print STDERR ("name, seq = $name, $seq\n"); }
+	# add new entry
+	if (!defined $desc[$seqCount]) {
+	    $desc[$seqCount] = $name;
+	    $returnVal[$seqCount] = "";
+	}
+if(!defined $seq) {return (undef, undef, undef, undef, undef, 1);} # Something has gone terrible wrong
+	my @chars = split(//,$seq);
+	my $char;
+	foreach (@chars) {
+	    if ($seqCount == 0) {
+		$reslength++;     # all sequences have same residue length, so only count first one
+	    }
+	    $total_residues++;
+	    $linelength++;
+	    # set $char
+	    if (defined $input_kind) {
+		if ($input_kind == $AA) {
+		    $char = (isAA($_)) ? $_ : "-";
+		} else { # == $NA
+		    $char = (isNA($_)) ? $_ : "-";
+		}
+	    } else {
+		$char = $_;
+		if (isNA($char)) {
+		    $NA_count++;
+		}
+	    }
+	    $returnVal[$seqCount] .= $char;
+	}
+	if ($seqCount == 0) {
+	    $prevlinelength = $linelength;
+	} elsif ($prevlinelength != $linelength) {  # different number of residues, so complain
+	    return (undef, undef, undef, 0, $name);
+	}
+	$linelength=0;
+	$seqCount++;
+}
+# determine whether to use $NA or $AA
+if (!defined $input_kind ) {
+	if ($NA_count / ($total_residues+1) >= $CONFIDENCE_LIMIT) {
+	    $kind = $NA;
+	} else {
+	    $kind = $AA;
+	}
+}
+return (\@returnVal, \@desc, $reslength, 1, undef);
+}
+# if $input_kind is defined, residues that are not defined are set to space
+sub getSeqs_FASTA {
+if ($DEBUG) {print STDERR "DOING FASTA\n";}
+my ($input_data_r, $input_kind) = @_;
+my @returnVal;
+my @desc;
+my $count=-1;
+my $newElem=0;
+#    my $input_kind = pop @_;
+#    my $CONFIDENCE_LIMIT = 0.90;
+my $NA_count = 0;
+my $total_residues = 0;
+my $reslength = 0;
+my $maxreslength = 0;
+my ($goodlength, $currline, $prevline);
+#    # skip all lines that are all spaces
+#    while ($_[0] =~ /^\s*$/) {
+#	shift @_;
+#    }
+#    foreach (@_) {
+foreach (@$input_data_r) {
+	$_ =~ s/\s+$//;
+	# skip if it is a comment character -- first character is "#"
+	next if (/^\s*\#/);
+	# skip all lines that are all spaces
+	next if (/^\s*$/);
+	$_ =~ s/\s+$//;  # cut trailing white space
+	$_ =~ s/^\s+//;  # cut leading white space
+	if (/>/) {
+	    $currline = $_;
+	    ($desc[scalar @desc]) = ($_ =~ />\s*(.+)$/);
+	    if (not $newElem) {
+		$count++;
+		$newElem = 1;
+	    }
+	} else {
+	    if ($newElem) {
+		$maxreslength = $reslength if $maxreslength == 0;
+		if (($maxreslength != 0) && ($maxreslength != $reslength)) {
+		    return (undef, undef, undef, 0, $prevline);
+		}
+		$maxreslength = $reslength;
+		$reslength = 0;
+	    }
+	    my @chars = split(//,$_);
+	    my $char;
+	    foreach (@chars) {
+		$reslength++;
+		$total_residues++;
+		# set $char
+		if (defined $input_kind) {
+		    if ($input_kind == $AA) {
+			$char = (isAA($_)) ? $_ : "-";
+		    } else { # == $NA
+			$char = (isNA($_)) ? $_ : "-";
+		    }
+		} else {
+		    $char = ($_ =~ /[a-zA-Z]/) ? $_ : "-";  # if not alpha char, use space
+		    if (isNA($char) && !isSpace($char)) {
+			$NA_count++;
+		    }
+		}
+		if ($newElem) {
+		    $returnVal[$count] = $char;
+		} else {
+		    $returnVal[$count] .= $char;
+		}
+		$newElem = 0;
+	    }
+	    $prevline = $currline if $currline =~ />/;
+	}
+}
+# check if last is biggest
+if (($maxreslength != 0) && ($maxreslength != $reslength)) {
+	return (undef, undef, undef, 0, $prevline);
+}
+#    $maxreslength = ($reslength > $maxreslength) ? $reslength : $maxreslength;
+# determine whether to use $NA or $AA
+if (!defined $input_kind) {
+	if ($NA_count / ($total_residues+1) >= $CONFIDENCE_LIMIT) {
+	    $kind = $NA;
+	} else {
+	    $kind = $AA;
+	}
+}
+return (\@returnVal, \@desc, $maxreslength || $reslength, 1, undef);  # 1 for good lengths
+}
+sub isSpace {
+return $_[0] =~ /[ \-]/;
+}
+sub isAA {
+return (defined $AMINOACIDS{lc $_[0]});
+}
+sub isNA {
+return (defined $BASES{lc $_[0]});
+}
+################################################################################
+######                       DATA BUILDING FUNCTIONS                      ######
+################################################################################
+# arguments: takes reference to array and lines aligned sequences of bases or
+#            amino acids
+# returns: updated reference array to reflect contents of sequences sorted
+#          vertically by height as described by (1)
+sub buildData {
+my $currentx = 0;
+my $h;
+my $count=0;
+my $maxreslength = pop (@_);
+my $stretch = pop(@_);
+my $smallsampletoggle = pop (@_);
+my $totalsize = $#_+1;
+while ($currentx < $maxreslength) {       #(length $_[0])) {
+	my $allspaces = 1;
+	my $spaceCount=0;
+	# get vertical sequence
+	my @vert=();
+	foreach (@_) {  # foreach sequence
+	    my $currentchar;
+	    # set currentchar, set to " " if $_ is not long enough
+	    if ($currentx >= (length $_)) {
+		$currentchar = " ";
+	    } else {
+		$currentchar = substr($_,$currentx,1);
+	    }
+	    # in all sequences, "-" is considered as a space
+	    # don't count " " and "-"
+	    if (($currentchar ne "-") && ($currentchar ne " ")) {
+		$vert[scalar @vert] = uc substr($_,$currentx,1);
+		$allspaces = 0;
+	    } else {
+		$spaceCount++;
+	    }
+	}
+	if ($allspaces) {
+	    # build @vert
+	    @vert = (" 0", ">0");
+	    # place in @data
+	    $data[scalar @data] = \@vert;
+	    $currentx++;
+	    next;
+	}
+	my $error;
+	if ($smallsampletoggle) {
+	    $error=getError($kind,$totalsize - $spaceCount);
+	} else {
+	    $error = 0;
+	}
+	# sort vertical sequence by amino acid or base
+	@vert = sort(@vert);
+	my $total = $#vert + 1;
+	# find H(l) -- must be done before collapsing
+	$h = getH(@vert);
+	# collect like terms
+	@vert = collapse(@vert);
+	# get R(l)
+	my $r;
+	if (!defined $stretch || !$stretch) {
+	    $r= getR($kind, $h, $error);
+	} else {
+	    $r = ($kind == $NA) ? 2 : (log(20) / log(2));
+	}
+	# place heights
+	my $count=0;
+	my $height;
+	my $elem;
+	foreach $elem (@vert) {
+	    $height = getHeight(substr($elem, 2) / $total,$r);
+	    $vert[$count] = substr($elem,0,1) . $height;
+	    $count++;
+	}
+	# sort by height
+	@vert = sort height_sort @vert;
+	# put in error bar size
+	$vert[$count] = ">$error";
+	# place in @data
+	$data[scalar @data] = \@vert;
+	$currentx++;
+}
+}
+# uses error approximation given by:
+#             e :=  (s-1) / (2 * ln2 * ntrue);
+sub getError {
+return ((($_[0] == $NA) ? 4 : 20) - 1) / (2 * log(2) * $_[1]);
+}
+sub height_sort {
+my ($lettera, $heighta) = ($a =~ /^(.{1})(\S+)$/); #substr($a,1);
+my ($letterb, $heightb) = ($b =~ /^(.{1})(\S+)$/); #substr($b,1);
+# compare by height first, then letter
+if ($heighta <=> $heightb) {
+	return $heighta <=> $heightb;
+} else {
+	return $letterb cmp $lettera;  #reversed for some reason...
+}
+}
+sub collapse {
+my @returnVal;
+my $current = $_[0];
+my $count=0;
+my $freq;
+foreach (@_) {
+if ($current eq $_) {
+$count++;
+} else {
+	    $returnVal[scalar @returnVal] = "$current $count";
+$current = $_;
+$count=1;
+}
+}
+# add last element
+$returnVal[scalar @returnVal] = "$current $count";
+return @returnVal;
+}
+# arguments : $_[0] : list of bases or amino acids
+sub getH {
+my $h = 0;
+my (@vert) = @_;  # vertical sequence (comparing multiple sequences)
+my $current = $vert[0];
+my $count=0;
+my $freq;
+foreach (@vert) {
+	if ($current eq $_) {
+	    $count++;
+	} else {
+	    $freq = $count / ($#vert + 1);
+	    $h += $freq * (log ($freq) / log(2));
+	    $current = $_;
+	    $count=1;
+	}
+}
+# add last element
+$freq = $count / ($#vert + 1);
+$h += $freq * (log ($freq) / log(2));
+return -$h;
+}
+# arguments : $_[0] : $AA or $NA
+#             $_[1] : uncertainty = H(l)
+#             $_[2] : error correction factor for small number of sequences
+#                     = e(n) ; assummed to be 0 if not given
+sub getR {
+my $max = ($_[0] == $NA) ? 2 : (log(20) / log(2));
+my $e = (defined $_[2]) ? $_[2] : 0;
+return ($max - ($_[1] + $e));
+}
+# arguments: $_[0] : frequency = f(b,l)
+#            $_[1] : R(l)
+sub getHeight {
+return $_[0] * $_[1]
+}
+1;

Mercurial > repos > davidvanzessen > mutation_analysis

comparison logo.pm @ 2:2f4298673519 draft