Mercurial > repos > davidvanzessen > mutation_analysis
comparison logo.pm @ 2:2f4298673519 draft
Uploaded
| author | davidvanzessen |
|---|---|
| date | Wed, 10 Sep 2014 10:33:29 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:856b5b718d21 | 2:2f4298673519 |
|---|---|
| 1 #!/usr/local/bin/perl -w | |
| 2 | |
| 3 =head1 NAME | |
| 4 | |
| 5 logo.pm - organizes data in FASTA and CLUSTAL formats into height data. | |
| 6 | |
| 7 =head1 SYNOPSIS | |
| 8 | |
| 9 Perl module | |
| 10 | |
| 11 =head1 DESCRIPTION | |
| 12 | |
| 13 logo.pm: Takes in strings of aligned sequences and sorts them vertically | |
| 14 based on height as assigned by the following equations found in | |
| 15 Schneider and Stephens paper "Sequence Logos: A New Way to Display | |
| 16 Consensus Sequences": | |
| 17 | |
| 18 height = f(b,l) * R(l) (1) | |
| 19 | |
| 20 where f(b,l) is the frequency of base or amino acid "b" at position | |
| 21 "l". R(l) is amount of information present at position "l" and can | |
| 22 be quantified as follows: | |
| 23 | |
| 24 R(l) for amino acids = log(20) - (H(l) + e(n)) (2a) | |
| 25 R(l) for nucleic acids = 2 - (H(l) + e(n)) (2b) | |
| 26 | |
| 27 where log is taken base 2, H(l) is the uncertainty at position "l", | |
| 28 and e(n) is the error correction factor for small "n". H(l) is | |
| 29 computed as follows: | |
| 30 | |
| 31 H(l) = - (Sum f(b,l) * log[ f(b,l) ]) (3) | |
| 32 | |
| 33 where again, log is taken base 2. f(b,l) is the frequency of base | |
| 34 "b" at position "l". The sum is taken over all amino acids or | |
| 35 bases, depending on which the data is. | |
| 36 | |
| 37 Currently, logo.pm uses an approximation for e(n), given by: | |
| 38 | |
| 39 e(n) = (s-1) / (2 * ln2 * n) (4) | |
| 40 | |
| 41 Where s is 4 for nucleotides, 20 for amino acids ; n is the number | |
| 42 of sequences in the alignment. e(n) also gives the height of error | |
| 43 bars. | |
| 44 | |
| 45 =cut | |
| 46 | |
| 47 package logo; | |
| 48 | |
| 49 use strict; | |
| 50 use Carp; | |
| 51 | |
| 52 ################################################################################ | |
| 53 ###### SOME VARIABLES ###### | |
| 54 ################################################################################ | |
| 55 | |
| 56 my $DEBUG = 0; | |
| 57 | |
| 58 my $AA = 0; | |
| 59 my $NA = 1; | |
| 60 | |
| 61 my %BASES = ("a" => "adenine", | |
| 62 "t" => "thymine", | |
| 63 "g" => "guanine", | |
| 64 "c" => "cytosine", | |
| 65 "u" => "uracil"); | |
| 66 | |
| 67 # does not include B or Z | |
| 68 my %AMINOACIDS = ("a" => "", "c" => "", "d" => "", "e" => "", "f" => "", | |
| 69 "g" => "", "h" => "", "i" => "", "k" => "", "l" => "", | |
| 70 "m" => "", "n" => "", "p" => "", "q" => "", "r" => "", | |
| 71 "s" => "", "t" => "", "v" => "", "w" => "", "y" => ""); | |
| 72 | |
| 73 my @data; | |
| 74 my $kind; | |
| 75 my ($seqs_r, $desc_r); | |
| 76 | |
| 77 my $CONFIDENCE_LIMIT = 0.90; | |
| 78 | |
| 79 ################################################################################ | |
| 80 ###### SOME FUNCTIONS ###### | |
| 81 ################################################################################ | |
| 82 | |
| 83 =head1 APPENDIX | |
| 84 | |
| 85 =cut | |
| 86 | |
| 87 =head2 getHeightData() | |
| 88 | |
| 89 Usage : my ($height_data_r, $description_data_r, $kind) = | |
| 90 logo::getHeightData($input_data_r, $params); | |
| 91 Returns : * REFERENCE TO array of height data | |
| 92 * REFERENCE TO array of input description strings | |
| 93 * $AA if the data is amino acid, $NA otherise | |
| 94 Args : $input_data_r : input data in CLUSTAL or FASTA formats | |
| 95 : $params : hash of parameters | |
| 96 | |
| 97 getHeightData is the entry point into the logo.pm module. $input_data_r is a | |
| 98 reference to an array of strings containing FASTA or CLUSTAL data, where all | |
| 99 lines whose first character is "#" is considered a comment line. | |
| 100 | |
| 101 $params is a hash of parameters with the following keys: | |
| 102 * smallsampletoggle : 0 to turn off small sample correction, otherwise | |
| 103 small sample correction is on | |
| 104 * input_kind : 0 for amino acids, 1 for nucleic acids; if undefined, | |
| 105 logo.pm will attempt to automatically detect whether the | |
| 106 input consists of amino or nucleic acid data. If | |
| 107 $input_kind is defined, only those residues defined by | |
| 108 $input_kind will be in the output -- all other residues are | |
| 109 considered as spaces. For example, if $input_kind is $NA, | |
| 110 the residue "I" or "i" are considered spaces, since "I" and | |
| 111 "i" are not nucleic acid residues. | |
| 112 * stretch : stretch all characters so they are flush at the maximum number | |
| 113 of bits allowed | |
| 114 | |
| 115 Sample use: | |
| 116 | |
| 117 # get FASTA data | |
| 118 open (FASTA, "$fastafile"); | |
| 119 my @inputdata = <FASTA>; | |
| 120 close (FASTA); | |
| 121 | |
| 122 my %heightparams = ( | |
| 123 smallsamplecorrection => 0, | |
| 124 input_kind => 0, | |
| 125 stretch => 0 | |
| 126 ); | |
| 127 | |
| 128 # get height data | |
| 129 my ($heightdata_r, $desc_r, $kind) = logo::getHeightData(\@inputdata, \%heightparams); | |
| 130 | |
| 131 =cut | |
| 132 | |
| 133 # entry point into module | |
| 134 sub getHeightData { | |
| 135 | |
| 136 # $smallsampletoggle is toggle to turn off small sample correction (1 to turn off) | |
| 137 # $input_kind can be $AA or $NA or undef | |
| 138 my ($input_data_r, $params) = @_; | |
| 139 | |
| 140 # gary 040119: adjust for formats (Unix is \n, Mac is \r, Windows is \r\n) | |
| 141 $input_data_r = normalizeData($input_data_r); | |
| 142 | |
| 143 # gets sequences, sets $kind temporarily | |
| 144 my ($goodlength, $maxreslength, $badline, $validformat); | |
| 145 ($seqs_r, $desc_r, $maxreslength, $goodlength, $badline, $validformat) = | |
| 146 getSeqs($input_data_r, $params->{input_kind}); | |
| 147 | |
| 148 # for(my $i = 0; $i < scalar @$seqs_r ; $i++) { | |
| 149 # print STDERR ($desc_r->[$i] . "\n" . $seqs_r->[$i] . "\n"); | |
| 150 # } | |
| 151 # print STDERR "maxreslength = $maxreslength\n"; | |
| 152 # | |
| 153 # exit(0); | |
| 154 | |
| 155 if ($DEBUG) { print STDERR ("point 1\n");} | |
| 156 | |
| 157 # check for valid format | |
| 158 if ((defined $validformat) && ($validformat == 1)) { | |
| 159 # print("returning\n"); | |
| 160 return (undef, undef, undef, undef, undef, 1); | |
| 161 } | |
| 162 | |
| 163 if ($DEBUG) { print STDERR ("point 2\n");} | |
| 164 | |
| 165 # check for bad length | |
| 166 if (!$goodlength) { | |
| 167 return (undef, undef, undef, $goodlength, $badline); | |
| 168 } | |
| 169 | |
| 170 # reset $kind if in $input_kind | |
| 171 if (defined $params->{input_kind} && isLegalKind($params->{input_kind}) ) { | |
| 172 $kind = $params->{input_kind}; | |
| 173 } | |
| 174 | |
| 175 # build data | |
| 176 buildData(@$seqs_r, $params->{smallsampletoggle}, $params->{stretch}, $maxreslength); | |
| 177 | |
| 178 if ($DEBUG) { print STDERR ("point 3\n");} | |
| 179 | |
| 180 # print STDERR ("data size = ", scalar @data, "\n"); | |
| 181 # foreach (@data) { | |
| 182 # print STDERR ("$_\n"); | |
| 183 # } | |
| 184 # | |
| 185 # exit(0); | |
| 186 # | |
| 187 # print STDERR ("return at 2\n"); | |
| 188 return (\@data, $desc_r, $kind, $goodlength, $badline); | |
| 189 } | |
| 190 | |
| 191 sub isLegalKind { | |
| 192 return ($_[0] =~ /^[01]$/); | |
| 193 } | |
| 194 | |
| 195 ################################################################################ | |
| 196 # | |
| 197 # sub normalizeData($data_r) returns $data_r, with Mac/Unix/Windows newline | |
| 198 # style normalized to standard Unix-style newline style | |
| 199 # | |
| 200 ################################################################################ | |
| 201 sub normalizeData { | |
| 202 my ($data_r) = @_; | |
| 203 | |
| 204 # check args | |
| 205 if (not defined $data_r) { | |
| 206 die "data_r must be defined\n"; | |
| 207 } | |
| 208 | |
| 209 my @normalized = (); | |
| 210 foreach my $pseudo_line (@$data_r) { | |
| 211 my @split_line = split(/[\r\n]+/, $pseudo_line); | |
| 212 push(@normalized, @split_line); | |
| 213 } | |
| 214 | |
| 215 return \@normalized; | |
| 216 } | |
| 217 | |
| 218 | |
| 219 ################################################################################ | |
| 220 # | |
| 221 # sub getSeqs($data_r, $kind) returns 5 values: | |
| 222 # | |
| 223 # * array reference to sequence strings | |
| 224 # * array reference to sequence names | |
| 225 # * length of sequence | |
| 226 # * 1 if all sequences have the same length, 0 else | |
| 227 # * line number L where sequenceLength(L) != sequenceLength(other lines), else | |
| 228 # undef | |
| 229 # | |
| 230 ################################################################################ | |
| 231 sub getSeqs { | |
| 232 my ($input_data_r, $kind) = @_; | |
| 233 | |
| 234 unless( $input_data_r->[0] ){ | |
| 235 return (undef, undef, undef, undef, undef, 1); | |
| 236 } | |
| 237 | |
| 238 # skip all comment chars and lines of all spaces | |
| 239 while ( ($input_data_r->[0] =~ /^\s*\#/) || ($input_data_r->[0] =~ /^\s*$/) ) { | |
| 240 shift @$input_data_r; | |
| 241 if( !defined $input_data_r->[0]) {return (undef, undef, undef, undef, undef, 1);} | |
| 242 } | |
| 243 | |
| 244 if (isFormat_FASTA($input_data_r)) { | |
| 245 return getSeqs_FASTA($input_data_r, $kind); | |
| 246 | |
| 247 } elsif (isFormat_CLUSTAL($input_data_r)) { | |
| 248 return getSeqs_CLUSTAL($input_data_r, $kind); | |
| 249 | |
| 250 } elsif (isFormat_FLAT($input_data_r)) { | |
| 251 return getSeqs_FLAT($input_data_r, $kind); | |
| 252 | |
| 253 } else { | |
| 254 if ($DEBUG) {print STDERR ("format nothing\n");} | |
| 255 return (undef, undef, undef, undef, undef, 1); | |
| 256 } | |
| 257 | |
| 258 # if ($_[0] =~ />/) { | |
| 259 # return getSeqs_FASTA(@_); | |
| 260 # } else { | |
| 261 # return getSeqs_CLUSTAL(@_); | |
| 262 # } | |
| 263 } | |
| 264 | |
| 265 ################################################################################ | |
| 266 # | |
| 267 # sub isFormat_FASTA($data_r) returns 1 if $data_r is in FASTA format | |
| 268 # | |
| 269 ################################################################################s | |
| 270 sub isFormat_FASTA { | |
| 271 my ($input_data_r) = @_; | |
| 272 | |
| 273 # check args | |
| 274 if (not defined $input_data_r) { | |
| 275 Carp::confess("logo::isFormat_FASTA : input_data_r must be defined\n"); | |
| 276 } | |
| 277 | |
| 278 if ($input_data_r->[0] =~ />/) { | |
| 279 return 1; | |
| 280 } else { | |
| 281 return 0; | |
| 282 } | |
| 283 } | |
| 284 | |
| 285 ################################################################################ | |
| 286 # | |
| 287 # sub isFormat_CLUSTAL($data_r) returns 1 if $data_r is in CLUSTAL format | |
| 288 # | |
| 289 ################################################################################ | |
| 290 sub isFormat_CLUSTAL { | |
| 291 my ($input_data_r) = @_; | |
| 292 | |
| 293 # check args | |
| 294 if (not defined $input_data_r) { | |
| 295 Carp::confess("logo::isFormat_CLUSTAL : input_data_r must be " . | |
| 296 "defined\n"); | |
| 297 } | |
| 298 | |
| 299 my $i=0; | |
| 300 | |
| 301 # # skip spaces or just "*" and "." and ":" | |
| 302 # while ($input_data_r->[$i] =~ /^[\*\:\s]*$/) { | |
| 303 # $i++; | |
| 304 # } | |
| 305 | |
| 306 # if it looks like CLUSTAL W (version) ... , then it must be clustal | |
| 307 if ($input_data_r->[$i] =~ /^\s*CLUSTAL/) { | |
| 308 return 1; | |
| 309 } | |
| 310 | |
| 311 # CLUSTAL looks like: "name seq" | |
| 312 if ($input_data_r->[$i] =~ /^\s*(\S+)\s+(\S+)\s*$/) { | |
| 313 return 1; | |
| 314 } else { | |
| 315 return 0; | |
| 316 } | |
| 317 } | |
| 318 | |
| 319 ################################################################################ | |
| 320 # | |
| 321 # sub isFormat_FLAT($data_r) returns 1 if $data_r is in FLAT format | |
| 322 # | |
| 323 ################################################################################ | |
| 324 sub isFormat_FLAT { | |
| 325 my ($input_data_r) = @_; | |
| 326 | |
| 327 # check args | |
| 328 if (not defined $input_data_r) { | |
| 329 Carp::confess("logo::isFormat_FLAT : input_data_r must be defined\n"); | |
| 330 } | |
| 331 | |
| 332 # print("checking flat\n"); | |
| 333 # print("first element = -->", $input_data_r->[0], "<--\n"); | |
| 334 | |
| 335 if ($input_data_r->[0] =~ /^[a-zA-Z\-]+\s*$/) { | |
| 336 return 1; | |
| 337 } else { | |
| 338 return 0; | |
| 339 } | |
| 340 } | |
| 341 | |
| 342 ################################################################################ | |
| 343 ###### FORMATTING FUNCTIONS ###### | |
| 344 ################################################################################ | |
| 345 | |
| 346 # the flat sequence format is as follows: | |
| 347 # sequence1 | |
| 348 # sequence2 | |
| 349 # sequence3 | |
| 350 # ... | |
| 351 # sequenceN | |
| 352 sub getSeqs_FLAT { | |
| 353 | |
| 354 if ($DEBUG) {print STDERR "DOING FLAT\n";} | |
| 355 | |
| 356 my ($input_data_r, $input_kind) = @_; | |
| 357 | |
| 358 my $linelength = 0; | |
| 359 my $seqCount = 0; | |
| 360 my $total_residues = 0; | |
| 361 my (@returnVal, @desc) = (); | |
| 362 my $prevlinelength = undef; | |
| 363 my $NA_count = 0; | |
| 364 | |
| 365 foreach my $seq (@$input_data_r) { | |
| 366 # chomp $seq; | |
| 367 $seq =~ s/\s+$//; | |
| 368 | |
| 369 my @chars = split(//,$seq); | |
| 370 | |
| 371 my $char; | |
| 372 foreach (@chars) { | |
| 373 $total_residues++; | |
| 374 $linelength++; | |
| 375 | |
| 376 # set $char | |
| 377 if (defined $input_kind) { | |
| 378 if ($input_kind == $AA) { | |
| 379 $char = (isAA($_)) ? $_ : "-"; | |
| 380 } else { # == $NA | |
| 381 $char = (isNA($_)) ? $_ : "-"; | |
| 382 } | |
| 383 } else { | |
| 384 $char = $_; | |
| 385 if (isNA($char)) { | |
| 386 $NA_count++; | |
| 387 } | |
| 388 } | |
| 389 | |
| 390 $returnVal[$seqCount] .= $char; | |
| 391 } | |
| 392 $desc[$seqCount] = "no name"; | |
| 393 | |
| 394 if ($seqCount == 0) { | |
| 395 $prevlinelength = $linelength; | |
| 396 } elsif ($prevlinelength != $linelength) { # different number of residues, so complain | |
| 397 return (undef, undef, undef, 0, $seq); # 0 for not same length, $seq is name | |
| 398 } | |
| 399 $linelength=0; | |
| 400 | |
| 401 $seqCount++; | |
| 402 } | |
| 403 | |
| 404 # determine whether to use $NA or $AA | |
| 405 if (!defined $input_kind) { | |
| 406 if ($NA_count / ($total_residues+1) >= $CONFIDENCE_LIMIT) { | |
| 407 $kind = $NA; | |
| 408 } else { | |
| 409 $kind = $AA; | |
| 410 } | |
| 411 } | |
| 412 | |
| 413 return (\@returnVal, \@desc, $prevlinelength, 1, undef); | |
| 414 } | |
| 415 | |
| 416 sub getSeqs_CLUSTAL { | |
| 417 | |
| 418 if ($DEBUG) {print STDERR "DOING CLUSTAL\n";} | |
| 419 | |
| 420 my ($input_data_r, $input_kind) = @_; | |
| 421 | |
| 422 my @returnVal; | |
| 423 my @desc; | |
| 424 my $seqCount=0; | |
| 425 my $reslength = 0; | |
| 426 my ($name, $seq); | |
| 427 | |
| 428 # my $input_kind = pop @_; | |
| 429 # my $CONFIDENCE_LIMIT = 0.90; | |
| 430 my $NA_count = 0; | |
| 431 my $total_residues = 0; | |
| 432 my ($prevlinelength, $linelength) = (0,0); | |
| 433 | |
| 434 # foreach (@_) { | |
| 435 foreach (@$input_data_r) { | |
| 436 # chomp; | |
| 437 | |
| 438 if ($DEBUG) {print STDERR ("line = $_\n")}; | |
| 439 | |
| 440 $_ =~ s/\s+$//; | |
| 441 | |
| 442 # skip if it is a comment character -- first character is "#" | |
| 443 next if (/^\s*\#/); | |
| 444 | |
| 445 # skil if it is a CLUSTAL W header line | |
| 446 next if (/^\s*CLUSTAL/); | |
| 447 | |
| 448 # if spaces or just "*" and "." and ":" | |
| 449 if (/^[\*\.\:\s]*$/) { | |
| 450 $seqCount=0; | |
| 451 $prevlinelength=0; | |
| 452 next; | |
| 453 } | |
| 454 | |
| 455 ($name,$seq) = (/^\s*(\S+)\s+(\S+)\s*$/); | |
| 456 | |
| 457 if ($DEBUG) { print STDERR ("name, seq = $name, $seq\n"); } | |
| 458 | |
| 459 # add new entry | |
| 460 if (!defined $desc[$seqCount]) { | |
| 461 $desc[$seqCount] = $name; | |
| 462 $returnVal[$seqCount] = ""; | |
| 463 } | |
| 464 | |
| 465 if(!defined $seq) {return (undef, undef, undef, undef, undef, 1);} # Something has gone terrible wrong | |
| 466 | |
| 467 my @chars = split(//,$seq); | |
| 468 my $char; | |
| 469 foreach (@chars) { | |
| 470 if ($seqCount == 0) { | |
| 471 $reslength++; # all sequences have same residue length, so only count first one | |
| 472 } | |
| 473 | |
| 474 $total_residues++; | |
| 475 $linelength++; | |
| 476 | |
| 477 # set $char | |
| 478 if (defined $input_kind) { | |
| 479 if ($input_kind == $AA) { | |
| 480 $char = (isAA($_)) ? $_ : "-"; | |
| 481 } else { # == $NA | |
| 482 $char = (isNA($_)) ? $_ : "-"; | |
| 483 } | |
| 484 } else { | |
| 485 $char = $_; | |
| 486 if (isNA($char)) { | |
| 487 $NA_count++; | |
| 488 } | |
| 489 } | |
| 490 | |
| 491 $returnVal[$seqCount] .= $char; | |
| 492 } | |
| 493 | |
| 494 if ($seqCount == 0) { | |
| 495 $prevlinelength = $linelength; | |
| 496 } elsif ($prevlinelength != $linelength) { # different number of residues, so complain | |
| 497 return (undef, undef, undef, 0, $name); | |
| 498 } | |
| 499 $linelength=0; | |
| 500 | |
| 501 $seqCount++; | |
| 502 } | |
| 503 | |
| 504 # determine whether to use $NA or $AA | |
| 505 if (!defined $input_kind ) { | |
| 506 if ($NA_count / ($total_residues+1) >= $CONFIDENCE_LIMIT) { | |
| 507 $kind = $NA; | |
| 508 } else { | |
| 509 $kind = $AA; | |
| 510 } | |
| 511 } | |
| 512 | |
| 513 return (\@returnVal, \@desc, $reslength, 1, undef); | |
| 514 } | |
| 515 | |
| 516 # if $input_kind is defined, residues that are not defined are set to space | |
| 517 sub getSeqs_FASTA { | |
| 518 | |
| 519 if ($DEBUG) {print STDERR "DOING FASTA\n";} | |
| 520 | |
| 521 my ($input_data_r, $input_kind) = @_; | |
| 522 | |
| 523 my @returnVal; | |
| 524 my @desc; | |
| 525 my $count=-1; | |
| 526 my $newElem=0; | |
| 527 | |
| 528 # my $input_kind = pop @_; | |
| 529 | |
| 530 # my $CONFIDENCE_LIMIT = 0.90; | |
| 531 my $NA_count = 0; | |
| 532 my $total_residues = 0; | |
| 533 my $reslength = 0; | |
| 534 my $maxreslength = 0; | |
| 535 | |
| 536 my ($goodlength, $currline, $prevline); | |
| 537 | |
| 538 | |
| 539 # # skip all lines that are all spaces | |
| 540 # while ($_[0] =~ /^\s*$/) { | |
| 541 # shift @_; | |
| 542 # } | |
| 543 | |
| 544 # foreach (@_) { | |
| 545 foreach (@$input_data_r) { | |
| 546 | |
| 547 $_ =~ s/\s+$//; | |
| 548 | |
| 549 # skip if it is a comment character -- first character is "#" | |
| 550 next if (/^\s*\#/); | |
| 551 | |
| 552 # skip all lines that are all spaces | |
| 553 next if (/^\s*$/); | |
| 554 | |
| 555 $_ =~ s/\s+$//; # cut trailing white space | |
| 556 $_ =~ s/^\s+//; # cut leading white space | |
| 557 if (/>/) { | |
| 558 $currline = $_; | |
| 559 ($desc[scalar @desc]) = ($_ =~ />\s*(.+)$/); | |
| 560 | |
| 561 if (not $newElem) { | |
| 562 $count++; | |
| 563 $newElem = 1; | |
| 564 } | |
| 565 } else { | |
| 566 if ($newElem) { | |
| 567 $maxreslength = $reslength if $maxreslength == 0; | |
| 568 if (($maxreslength != 0) && ($maxreslength != $reslength)) { | |
| 569 return (undef, undef, undef, 0, $prevline); | |
| 570 } | |
| 571 | |
| 572 $maxreslength = $reslength; | |
| 573 $reslength = 0; | |
| 574 } | |
| 575 | |
| 576 my @chars = split(//,$_); | |
| 577 my $char; | |
| 578 foreach (@chars) { | |
| 579 $reslength++; | |
| 580 $total_residues++; | |
| 581 | |
| 582 # set $char | |
| 583 if (defined $input_kind) { | |
| 584 if ($input_kind == $AA) { | |
| 585 $char = (isAA($_)) ? $_ : "-"; | |
| 586 } else { # == $NA | |
| 587 $char = (isNA($_)) ? $_ : "-"; | |
| 588 } | |
| 589 } else { | |
| 590 $char = ($_ =~ /[a-zA-Z]/) ? $_ : "-"; # if not alpha char, use space | |
| 591 if (isNA($char) && !isSpace($char)) { | |
| 592 $NA_count++; | |
| 593 } | |
| 594 } | |
| 595 | |
| 596 if ($newElem) { | |
| 597 $returnVal[$count] = $char; | |
| 598 } else { | |
| 599 $returnVal[$count] .= $char; | |
| 600 } | |
| 601 $newElem = 0; | |
| 602 } | |
| 603 $prevline = $currline if $currline =~ />/; | |
| 604 } | |
| 605 } | |
| 606 | |
| 607 # check if last is biggest | |
| 608 if (($maxreslength != 0) && ($maxreslength != $reslength)) { | |
| 609 return (undef, undef, undef, 0, $prevline); | |
| 610 } | |
| 611 # $maxreslength = ($reslength > $maxreslength) ? $reslength : $maxreslength; | |
| 612 | |
| 613 # determine whether to use $NA or $AA | |
| 614 if (!defined $input_kind) { | |
| 615 if ($NA_count / ($total_residues+1) >= $CONFIDENCE_LIMIT) { | |
| 616 $kind = $NA; | |
| 617 } else { | |
| 618 $kind = $AA; | |
| 619 } | |
| 620 } | |
| 621 | |
| 622 return (\@returnVal, \@desc, $maxreslength || $reslength, 1, undef); # 1 for good lengths | |
| 623 } | |
| 624 | |
| 625 sub isSpace { | |
| 626 return $_[0] =~ /[ \-]/; | |
| 627 } | |
| 628 | |
| 629 sub isAA { | |
| 630 return (defined $AMINOACIDS{lc $_[0]}); | |
| 631 } | |
| 632 | |
| 633 sub isNA { | |
| 634 return (defined $BASES{lc $_[0]}); | |
| 635 } | |
| 636 | |
| 637 ################################################################################ | |
| 638 ###### DATA BUILDING FUNCTIONS ###### | |
| 639 ################################################################################ | |
| 640 | |
| 641 | |
| 642 # arguments: takes reference to array and lines aligned sequences of bases or | |
| 643 # amino acids | |
| 644 # returns: updated reference array to reflect contents of sequences sorted | |
| 645 # vertically by height as described by (1) | |
| 646 sub buildData { | |
| 647 | |
| 648 my $currentx = 0; | |
| 649 my $h; | |
| 650 my $count=0; | |
| 651 my $maxreslength = pop (@_); | |
| 652 my $stretch = pop(@_); | |
| 653 my $smallsampletoggle = pop (@_); | |
| 654 my $totalsize = $#_+1; | |
| 655 | |
| 656 while ($currentx < $maxreslength) { #(length $_[0])) { | |
| 657 my $allspaces = 1; | |
| 658 my $spaceCount=0; | |
| 659 | |
| 660 # get vertical sequence | |
| 661 my @vert=(); | |
| 662 foreach (@_) { # foreach sequence | |
| 663 my $currentchar; | |
| 664 | |
| 665 # set currentchar, set to " " if $_ is not long enough | |
| 666 if ($currentx >= (length $_)) { | |
| 667 $currentchar = " "; | |
| 668 } else { | |
| 669 $currentchar = substr($_,$currentx,1); | |
| 670 } | |
| 671 | |
| 672 # in all sequences, "-" is considered as a space | |
| 673 # don't count " " and "-" | |
| 674 if (($currentchar ne "-") && ($currentchar ne " ")) { | |
| 675 $vert[scalar @vert] = uc substr($_,$currentx,1); | |
| 676 $allspaces = 0; | |
| 677 } else { | |
| 678 $spaceCount++; | |
| 679 } | |
| 680 } | |
| 681 | |
| 682 if ($allspaces) { | |
| 683 # build @vert | |
| 684 @vert = (" 0", ">0"); | |
| 685 | |
| 686 # place in @data | |
| 687 $data[scalar @data] = \@vert; | |
| 688 | |
| 689 $currentx++; | |
| 690 next; | |
| 691 } | |
| 692 | |
| 693 my $error; | |
| 694 if ($smallsampletoggle) { | |
| 695 $error=getError($kind,$totalsize - $spaceCount); | |
| 696 } else { | |
| 697 $error = 0; | |
| 698 } | |
| 699 | |
| 700 # sort vertical sequence by amino acid or base | |
| 701 @vert = sort(@vert); | |
| 702 my $total = $#vert + 1; | |
| 703 | |
| 704 # find H(l) -- must be done before collapsing | |
| 705 $h = getH(@vert); | |
| 706 | |
| 707 # collect like terms | |
| 708 @vert = collapse(@vert); | |
| 709 | |
| 710 # get R(l) | |
| 711 my $r; | |
| 712 if (!defined $stretch || !$stretch) { | |
| 713 $r= getR($kind, $h, $error); | |
| 714 } else { | |
| 715 $r = ($kind == $NA) ? 2 : (log(20) / log(2)); | |
| 716 } | |
| 717 | |
| 718 # place heights | |
| 719 my $count=0; | |
| 720 my $height; | |
| 721 my $elem; | |
| 722 foreach $elem (@vert) { | |
| 723 $height = getHeight(substr($elem, 2) / $total,$r); | |
| 724 $vert[$count] = substr($elem,0,1) . $height; | |
| 725 $count++; | |
| 726 } | |
| 727 | |
| 728 # sort by height | |
| 729 @vert = sort height_sort @vert; | |
| 730 | |
| 731 # put in error bar size | |
| 732 $vert[$count] = ">$error"; | |
| 733 | |
| 734 # place in @data | |
| 735 $data[scalar @data] = \@vert; | |
| 736 | |
| 737 $currentx++; | |
| 738 } | |
| 739 } | |
| 740 | |
| 741 # uses error approximation given by: | |
| 742 # e := (s-1) / (2 * ln2 * ntrue); | |
| 743 sub getError { | |
| 744 return ((($_[0] == $NA) ? 4 : 20) - 1) / (2 * log(2) * $_[1]); | |
| 745 } | |
| 746 | |
| 747 sub height_sort { | |
| 748 my ($lettera, $heighta) = ($a =~ /^(.{1})(\S+)$/); #substr($a,1); | |
| 749 my ($letterb, $heightb) = ($b =~ /^(.{1})(\S+)$/); #substr($b,1); | |
| 750 | |
| 751 # compare by height first, then letter | |
| 752 if ($heighta <=> $heightb) { | |
| 753 return $heighta <=> $heightb; | |
| 754 } else { | |
| 755 return $letterb cmp $lettera; #reversed for some reason... | |
| 756 } | |
| 757 } | |
| 758 | |
| 759 sub collapse { | |
| 760 my @returnVal; | |
| 761 my $current = $_[0]; | |
| 762 my $count=0; | |
| 763 my $freq; | |
| 764 | |
| 765 foreach (@_) { | |
| 766 if ($current eq $_) { | |
| 767 $count++; | |
| 768 } else { | |
| 769 $returnVal[scalar @returnVal] = "$current $count"; | |
| 770 | |
| 771 $current = $_; | |
| 772 $count=1; | |
| 773 } | |
| 774 } | |
| 775 | |
| 776 # add last element | |
| 777 $returnVal[scalar @returnVal] = "$current $count"; | |
| 778 | |
| 779 return @returnVal; | |
| 780 } | |
| 781 | |
| 782 # arguments : $_[0] : list of bases or amino acids | |
| 783 sub getH { | |
| 784 my $h = 0; | |
| 785 my (@vert) = @_; # vertical sequence (comparing multiple sequences) | |
| 786 | |
| 787 my $current = $vert[0]; | |
| 788 my $count=0; | |
| 789 my $freq; | |
| 790 | |
| 791 foreach (@vert) { | |
| 792 if ($current eq $_) { | |
| 793 $count++; | |
| 794 } else { | |
| 795 $freq = $count / ($#vert + 1); | |
| 796 $h += $freq * (log ($freq) / log(2)); | |
| 797 | |
| 798 $current = $_; | |
| 799 $count=1; | |
| 800 } | |
| 801 } | |
| 802 | |
| 803 # add last element | |
| 804 $freq = $count / ($#vert + 1); | |
| 805 $h += $freq * (log ($freq) / log(2)); | |
| 806 | |
| 807 return -$h; | |
| 808 } | |
| 809 | |
| 810 # arguments : $_[0] : $AA or $NA | |
| 811 # $_[1] : uncertainty = H(l) | |
| 812 # $_[2] : error correction factor for small number of sequences | |
| 813 # = e(n) ; assummed to be 0 if not given | |
| 814 sub getR { | |
| 815 my $max = ($_[0] == $NA) ? 2 : (log(20) / log(2)); | |
| 816 my $e = (defined $_[2]) ? $_[2] : 0; | |
| 817 return ($max - ($_[1] + $e)); | |
| 818 } | |
| 819 | |
| 820 # arguments: $_[0] : frequency = f(b,l) | |
| 821 # $_[1] : R(l) | |
| 822 sub getHeight { | |
| 823 return $_[0] * $_[1] | |
| 824 } | |
| 825 | |
| 826 1; |
