Mercurial > repos > devteam > indels_3way
comparison parseMAF_smallIndels.pl @ 0:79112aff0ccc draft default tip
Uploaded tool tarball.
| author | devteam |
|---|---|
| date | Tue, 20 Aug 2013 10:59:06 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:79112aff0ccc |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 # a program to get indels | |
| 3 # input is a MAF format 3-way alignment file | |
| 4 # from 3-way blocks only at this time | |
| 5 # translate seq2, seq3, etc coordinates to + if align orient is reverse complement | |
| 6 | |
| 7 use strict; | |
| 8 use warnings; | |
| 9 | |
| 10 # declare and initialize variables | |
| 11 my $fh; # variable to store filehandle | |
| 12 my $record; | |
| 13 my $offset; | |
| 14 my $library = $ARGV[0]; | |
| 15 my $count = 0; | |
| 16 my $count2 = 0; | |
| 17 my $count3 = 0; | |
| 18 my $count4 = 0; | |
| 19 my $start1 = my $start2 = my $start3 = my $start4 = my $start5 = my $start6 = 0; | |
| 20 my $orient = ""; | |
| 21 my $outgroup = $ARGV[2]; | |
| 22 my $ingroup1 = my $ingroup2 = ""; | |
| 23 my $count_seq1insert = my $count_seq1delete = 0; | |
| 24 my $count_seq2insert = my $count_seq2delete = 0; | |
| 25 my $count_seq3insert = my $count_seq3delete = 0; | |
| 26 my @seq1_insert_lengths = my @seq1_delete_lengths = (); | |
| 27 my @seq2_insert_lengths = my @seq2_delete_lengths = (); | |
| 28 my @seq3_insert_lengths = my @seq3_delete_lengths = (); | |
| 29 my @seq1_insert = my @seq1_delete = my @seq2_insert = my @seq2_delete = my @seq3_insert = my @seq3_delete = (); | |
| 30 my @seq1_insert_startOnly = my @seq1_delete_startOnly = my @seq2_insert_startOnly = my @seq2_delete_startOnly = (); | |
| 31 my @seq3_insert_startOnly = my @seq3_delete_startOnly = (); | |
| 32 my @indels = (); | |
| 33 | |
| 34 # check to make sure correct files | |
| 35 my $usage = "usage: parseMAF_smallIndels.pl [MAF.in] [small_Indels_summary.out] [outgroup]\n"; | |
| 36 die $usage unless @ARGV == 3; | |
| 37 | |
| 38 # perform some standard subroutines | |
| 39 $fh = open_file($library); | |
| 40 | |
| 41 $offset = tell($fh); | |
| 42 | |
| 43 #my $ofile = $ARGV[2]; | |
| 44 #unless (open(OFILE, ">$ofile")){ | |
| 45 # print "Cannot open output file \"$ofile\"\n\n"; | |
| 46 # exit; | |
| 47 #} | |
| 48 | |
| 49 my $ofile2 = $ARGV[1]; | |
| 50 unless (open(OFILE2, ">$ofile2")){ | |
| 51 print "Cannot open output file \"$ofile2\"\n\n"; | |
| 52 exit; | |
| 53 } | |
| 54 | |
| 55 | |
| 56 # header line for output files | |
| 57 #print OFILE "# small indel events, parsed from MAF 3-way alignment file, coords are translated from (-) to (+) if necessary\n"; | |
| 58 #print OFILE "#align\tingroup1\tingroup1_coord\tingroup1_orient\tingroup2\tingroup2_coord\tingroup2_orient\toutgroup\toutgroup_coord\toutgroup_orient\tindel_type\n"; | |
| 59 | |
| 60 #print OFILE2 "# small indels summary, parsed from MAF 3-way alignment file, coords are translated from (-) to (+) if necessary\n"; | |
| 61 print OFILE2 "#block\tindel_type\tindel_length\tingroup1\tingroup1_start\tingroup1_end\tingroup1_alignSize\tingroup1_orient\tingroup2\tingroup2_start\tingroup2_end\tingroup2_alignSize\tingroup2_orient\toutgroup\toutgroup_start\toutgroup_end\toutgroup_alignSize\toutgroup_orient\n"; | |
| 62 | |
| 63 # main body of program | |
| 64 while ($record = get_next_record($fh) ){ | |
| 65 if ($record=~ m/\s*##maf(.*)\s*# maf/s){ | |
| 66 next; | |
| 67 } | |
| 68 | |
| 69 my @sequences = get_sequences_within_block($record); | |
| 70 my @seq_info = get_indels_within_block(@sequences); | |
| 71 get_indels_lengths(@seq_info); | |
| 72 | |
| 73 $offset = tell($fh); | |
| 74 $count++; | |
| 75 | |
| 76 } | |
| 77 | |
| 78 get_starts_only(@seq1_insert); | |
| 79 get_starts_only(@seq1_delete); | |
| 80 get_starts_only(@seq2_insert); | |
| 81 get_starts_only(@seq2_delete); | |
| 82 get_starts_only(@seq3_insert); | |
| 83 get_starts_only(@seq3_delete); | |
| 84 | |
| 85 # print some things to keep track of progress | |
| 86 #print "# $library\n"; | |
| 87 #print "# number of records = $count\n"; | |
| 88 #print "# number of sequence \"s\" lines = $count2\n"; | |
| 89 if ($count3 != 0){ | |
| 90 print "Skipped $count3 blocks with only 2 seqs;\n"; | |
| 91 } | |
| 92 #print "# number of records with only h-m = $count4\n\n"; | |
| 93 | |
| 94 print "Ingroup1 = $ingroup1; Ingroup2 = $ingroup2; Outgroup = $outgroup;\n"; | |
| 95 print "# of ingroup1 inserts = $count_seq1insert;\n"; | |
| 96 print "# of ingroup1 deletes = $count_seq1delete;\n"; | |
| 97 print "# of ingroup2 inserts = $count_seq2insert;\n"; | |
| 98 print "# of ingroup2 deletes = $count_seq2delete;\n"; | |
| 99 print "# of outgroup3 inserts = $count_seq3insert;\n"; | |
| 100 print "# of outgroup3 deletes = $count_seq3delete\n"; | |
| 101 | |
| 102 | |
| 103 #close OFILE; | |
| 104 | |
| 105 if ($count == $count3){ | |
| 106 print STDERR "Skipped all blocks since none of them contain 3-way alignments.\n"; | |
| 107 exit -1; | |
| 108 } | |
| 109 | |
| 110 ###################SUBROUTINES##################################### | |
| 111 | |
| 112 # subroutine to open file | |
| 113 sub open_file { | |
| 114 my($filename) = @_; | |
| 115 my $fh; | |
| 116 | |
| 117 unless (open($fh, $filename)){ | |
| 118 print "Cannot open file $filename\n"; | |
| 119 exit; | |
| 120 } | |
| 121 return $fh; | |
| 122 } | |
| 123 | |
| 124 # get next record | |
| 125 sub get_next_record { | |
| 126 my ($fh) = @_; | |
| 127 my ($offset); | |
| 128 my ($record) = ""; | |
| 129 my ($save_input_separator) = $/; | |
| 130 | |
| 131 $/ = "a score"; | |
| 132 | |
| 133 $record = <$fh>; | |
| 134 | |
| 135 $/ = $save_input_separator; | |
| 136 return $record; | |
| 137 } | |
| 138 | |
| 139 # get header info | |
| 140 sub get_sequences_within_block{ | |
| 141 my (@alignment) = @_; | |
| 142 my @lines = (); | |
| 143 | |
| 144 my @sequences = (); | |
| 145 | |
| 146 @lines = split ("\n", $record); | |
| 147 foreach (@lines){ | |
| 148 chomp($_); | |
| 149 if (m/^\s*$/){ | |
| 150 next; | |
| 151 } | |
| 152 elsif (m/^\s*=(\d+\.*\d*)/){ | |
| 153 | |
| 154 }elsif (m/^\s*s(.*)$/){ | |
| 155 $count2++; | |
| 156 | |
| 157 push (@sequences,$_); | |
| 158 } | |
| 159 } | |
| 160 return @sequences; | |
| 161 } | |
| 162 | |
| 163 sub get_indels_within_block{ | |
| 164 my (@sequences) = @_; | |
| 165 my $line1 = my $line2 = my $line3 = ""; | |
| 166 my @line1 = my @line2 = my @line3 = (); | |
| 167 my $score = 0; | |
| 168 my $start1 = my $align_length1 = my $end1 = my $seq_length1 = 0; | |
| 169 my $start2 = my $align_length2 = my $end2 = my $seq_length2 = 0; | |
| 170 my $start3 = my $align_length3 = my $end3 = my $seq_length3 = 0; | |
| 171 my $seq1 = my $orient1 = ""; | |
| 172 my $seq2 = my $orient2 = ""; | |
| 173 my $seq3 = my $orient3 = ""; | |
| 174 my $start1_plus = my $end1_plus = 0; | |
| 175 my $start2_plus = my $end2_plus = 0; | |
| 176 my $start3_plus = my $end3_plus = 0; | |
| 177 my @test = (); | |
| 178 my $test = ""; | |
| 179 my $header = ""; | |
| 180 my @header = (); | |
| 181 my $sequence1 = my $sequence2 = my $sequence3 =""; | |
| 182 my @array_return = (); | |
| 183 my $test1 = 0; | |
| 184 my $line1_stat = my $line2_stat = my $line3_stat = ""; | |
| 185 | |
| 186 # process 3-way blocks only | |
| 187 if (scalar(@sequences) == 3){ | |
| 188 $line1 = $sequences[0]; | |
| 189 chomp ($line1); | |
| 190 $line2 = $sequences[1]; | |
| 191 chomp ($line2); | |
| 192 $line3 = $sequences[2]; | |
| 193 chomp ($line3); | |
| 194 # check order of sequences and assign uniformly seq1= human, seq2 = chimp, seq3 = macaque | |
| 195 if ($line1 =~ m/$outgroup/){ | |
| 196 $line1_stat = "out"; | |
| 197 $line2=~ s/^\s*//; | |
| 198 $line2 =~ s/\s+/\t/g; | |
| 199 @line2 = split(/\t/, $line2); | |
| 200 if (($ingroup1 eq "") || ($line2[1] =~ m/$ingroup1/)){ | |
| 201 $line2_stat = "in1"; | |
| 202 $line3_stat = "in2"; | |
| 203 } | |
| 204 else{ | |
| 205 $line3_stat = "in1"; | |
| 206 $line2_stat = "in2"; } | |
| 207 } | |
| 208 elsif ($line2 =~ m/$outgroup/){ | |
| 209 $line2_stat = "out"; | |
| 210 $line1=~ s/^\s*//; | |
| 211 $line1 =~ s/\s+/\t/g; | |
| 212 @line1 = split(/\t/, $line1); | |
| 213 if (($ingroup1 eq "") || ($line1[1] =~ m/$ingroup1/)){ | |
| 214 $line1_stat = "in1"; | |
| 215 $line3_stat = "in2"; | |
| 216 } | |
| 217 else{ | |
| 218 $line3_stat = "in1"; | |
| 219 $line1_stat = "in2"; } | |
| 220 } | |
| 221 elsif ($line3 =~ m/$outgroup/){ | |
| 222 $line3_stat = "out"; | |
| 223 $line1=~ s/^\s*//; | |
| 224 $line1 =~ s/\s+/\t/g; | |
| 225 @line1 = split(/\t/, $line1); | |
| 226 if (($ingroup1 eq "") || ($line1[1] =~ m/$ingroup1/)){ | |
| 227 $line1_stat = "in1"; | |
| 228 $line2_stat = "in2"; | |
| 229 } | |
| 230 else{ | |
| 231 $line2_stat = "in1"; | |
| 232 $line1_stat = "in2"; } | |
| 233 } | |
| 234 | |
| 235 #print "# l1 = $line1_stat\n"; | |
| 236 #print "# l2 = $line2_stat\n"; | |
| 237 #print "# l3 = $line3_stat\n"; | |
| 238 my $linei1 = my $linei2 = my $lineo = ""; | |
| 239 my @linei1 = my @linei2 = my @lineo = (); | |
| 240 | |
| 241 if ($line1_stat eq "out"){ | |
| 242 $lineo = $line1; | |
| 243 } | |
| 244 elsif ($line1_stat eq "in1"){ | |
| 245 $linei1 = $line1; | |
| 246 } | |
| 247 else{ | |
| 248 $linei2 = $line1; | |
| 249 } | |
| 250 | |
| 251 if ($line2_stat eq "out"){ | |
| 252 $lineo = $line2; | |
| 253 } | |
| 254 elsif ($line2_stat eq "in1"){ | |
| 255 $linei1 = $line2; | |
| 256 } | |
| 257 else{ | |
| 258 $linei2 = $line2; | |
| 259 } | |
| 260 | |
| 261 if ($line3_stat eq "out"){ | |
| 262 $lineo = $line3; | |
| 263 } | |
| 264 elsif ($line3_stat eq "in1"){ | |
| 265 $linei1 = $line3; | |
| 266 } | |
| 267 else{ | |
| 268 $linei2 = $line3; | |
| 269 } | |
| 270 | |
| 271 $linei1=~ s/^\s*//; | |
| 272 $linei1 =~ s/\s+/\t/g; | |
| 273 @linei1 = split(/\t/, $linei1); | |
| 274 $end1 =($linei1[2]+$linei1[3]-1); | |
| 275 $seq1 = $linei1[1].":".$linei1[3]; | |
| 276 $ingroup1 = (split(/\./, $seq1))[0]; | |
| 277 $start1 = $linei1[2]; | |
| 278 $align_length1 = $linei1[3]; | |
| 279 $orient1 = $linei1[4]; | |
| 280 $seq_length1 = $linei1[5]; | |
| 281 $sequence1 = $linei1[6]; | |
| 282 $test1 = length($sequence1); | |
| 283 my $total_length1 = $test1+$start1; | |
| 284 my @array1 = ($start1,$end1,$orient1,$seq_length1); | |
| 285 ($start1_plus, $end1_plus) = convert_coords(@array1); | |
| 286 | |
| 287 $linei2=~ s/^\s*//; | |
| 288 $linei2 =~ s/\s+/\t/g; | |
| 289 @linei2 = split(/\t/, $linei2); | |
| 290 $end2 =($linei2[2]+$linei2[3]-1); | |
| 291 $seq2 = $linei2[1].":".$linei2[3]; | |
| 292 $ingroup2 = (split(/\./, $seq2))[0]; | |
| 293 $start2 = $linei2[2]; | |
| 294 $align_length2 = $linei2[3]; | |
| 295 $orient2 = $linei2[4]; | |
| 296 $seq_length2 = $linei2[5]; | |
| 297 $sequence2 = $linei2[6]; | |
| 298 my $test2 = length($sequence2); | |
| 299 my $total_length2 = $test2+$start2; | |
| 300 my @array2 = ($start2,$end2,$orient2,$seq_length2); | |
| 301 ($start2_plus, $end2_plus) = convert_coords(@array2); | |
| 302 | |
| 303 $lineo=~ s/^\s*//; | |
| 304 $lineo =~ s/\s+/\t/g; | |
| 305 @lineo = split(/\t/, $lineo); | |
| 306 $end3 =($lineo[2]+$lineo[3]-1); | |
| 307 $seq3 = $lineo[1].":".$lineo[3]; | |
| 308 $start3 = $lineo[2]; | |
| 309 $align_length3 = $lineo[3]; | |
| 310 $orient3 = $lineo[4]; | |
| 311 $seq_length3 = $lineo[5]; | |
| 312 $sequence3 = $lineo[6]; | |
| 313 my $test3 = length($sequence3); | |
| 314 my $total_length3 = $test3+$start3; | |
| 315 my @array3 = ($start3,$end3,$orient3,$seq_length3); | |
| 316 ($start3_plus, $end3_plus) = convert_coords(@array3); | |
| 317 | |
| 318 #print "# l1 = $ingroup1\n"; | |
| 319 #print "# l2 = $ingroup2\n"; | |
| 320 #print "# l3 = $outgroup\n"; | |
| 321 | |
| 322 my $ABC = ""; | |
| 323 my $coord1 = my $coord2 = my $coord3 = 0; | |
| 324 $coord1 = $start1_plus; | |
| 325 $coord2 = $start2_plus; | |
| 326 $coord3 = $start3_plus; | |
| 327 | |
| 328 for (my $position = 0; $position < $test1; $position++) { | |
| 329 my $indelType = ""; | |
| 330 my $indel_line = ""; | |
| 331 # seq1 deletes | |
| 332 if ((substr($sequence1,$position,1) eq "-") | |
| 333 && (substr($sequence2,$position,1) !~ m/[-*\#$?^@]/) | |
| 334 && (substr($sequence3,$position,1) !~ m/[-*\#$?^@]/)){ | |
| 335 $ABC = join("",($ABC,"X")); | |
| 336 my @s = split(/:/, $seq1); | |
| 337 $indelType = $s[0]."_delete"; | |
| 338 | |
| 339 #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; | |
| 340 $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); | |
| 341 push (@indels,$indel_line); | |
| 342 push (@seq1_delete,$indel_line); | |
| 343 $coord2++; $coord3++; | |
| 344 } | |
| 345 # seq2 deletes | |
| 346 elsif ((substr($sequence1,$position,1) !~ m/[-*\#$?^@]/) | |
| 347 && (substr($sequence2,$position,1) eq "-") | |
| 348 && (substr($sequence3,$position,1) !~ m/[-*\$?^]/)){ | |
| 349 $ABC = join("",($ABC,"Y")); | |
| 350 my @s = split(/:/, $seq2); | |
| 351 $indelType = $s[0]."_delete"; | |
| 352 #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; | |
| 353 $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); | |
| 354 push (@indels,$indel_line); | |
| 355 push (@seq2_delete,$indel_line); | |
| 356 $coord1++; | |
| 357 $coord3++; | |
| 358 | |
| 359 } | |
| 360 # seq1 inserts | |
| 361 elsif ((substr($sequence1,$position,1) !~ m/[-*\#$?^@]/) | |
| 362 && (substr($sequence2,$position,1) eq "-") | |
| 363 && (substr($sequence3,$position,1) eq "-")){ | |
| 364 $ABC = join("",($ABC,"Z")); | |
| 365 my @s = split(/:/, $seq1); | |
| 366 $indelType = $s[0]."_insert"; | |
| 367 #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; | |
| 368 $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); | |
| 369 push (@indels,$indel_line); | |
| 370 push (@seq1_insert,$indel_line); | |
| 371 $coord1++; | |
| 372 } | |
| 373 # seq2 inserts | |
| 374 elsif ((substr($sequence1,$position,1) eq "-") | |
| 375 && (substr($sequence2,$position,1) !~ m/[-*\#$?^@]/) | |
| 376 && (substr($sequence3,$position,1) eq "-")){ | |
| 377 $ABC = join("",($ABC,"W")); | |
| 378 my @s = split(/:/, $seq2); | |
| 379 $indelType = $s[0]."_insert"; | |
| 380 #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; | |
| 381 $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); | |
| 382 push (@indels,$indel_line); | |
| 383 push (@seq2_insert,$indel_line); | |
| 384 $coord2++; | |
| 385 } | |
| 386 # seq3 deletes | |
| 387 elsif ((substr($sequence1,$position,1) !~ m/[-*\#$?^@]/) | |
| 388 && (substr($sequence2,$position,1) !~ m/[-*\#$?^@]/) | |
| 389 && (substr($sequence3,$position,1) eq "-")){ | |
| 390 $ABC = join("",($ABC,"S")); | |
| 391 my @s = split(/:/, $seq3); | |
| 392 $indelType = $s[0]."_delete"; | |
| 393 #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; | |
| 394 $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); | |
| 395 push (@indels,$indel_line); | |
| 396 push (@seq3_delete,$indel_line); | |
| 397 $coord1++; $coord2++; | |
| 398 } | |
| 399 # seq3 inserts | |
| 400 elsif ((substr($sequence1,$position,1) eq "-") | |
| 401 && (substr($sequence2,$position,1) eq "-") | |
| 402 && (substr($sequence3,$position,1) !~ m/[-*\#$?^@]/)){ | |
| 403 $ABC = join("",($ABC,"T")); | |
| 404 my @s = split(/:/, $seq3); | |
| 405 $indelType = $s[0]."_insert"; | |
| 406 #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; | |
| 407 $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); | |
| 408 push (@indels,$indel_line); | |
| 409 push (@seq3_insert,$indel_line); | |
| 410 $coord3++; | |
| 411 }else{ | |
| 412 $ABC = join("",($ABC,"N")); | |
| 413 $coord1++; $coord2++; $coord3++; | |
| 414 } | |
| 415 | |
| 416 } | |
| 417 @array_return=($seq1,$seq2,$seq3,$ABC); | |
| 418 return (@array_return); | |
| 419 | |
| 420 } | |
| 421 # ignore pairwise cases for now, just count the number of blocks | |
| 422 elsif (scalar(@sequences) == 2){ | |
| 423 my $ABC = ""; | |
| 424 my $coord1 = my $coord2 = my $coord3 = 0; | |
| 425 $count3++; | |
| 426 | |
| 427 $line1 = $sequences[0]; | |
| 428 $line2 = $sequences[1]; | |
| 429 chomp ($line1); | |
| 430 chomp ($line2); | |
| 431 | |
| 432 if ($line2 !~ m/$ingroup2/){ | |
| 433 $count4++; | |
| 434 } | |
| 435 } | |
| 436 } | |
| 437 | |
| 438 | |
| 439 sub get_indels_lengths{ | |
| 440 my (@array) = @_; | |
| 441 if (scalar(@array) == 4){ | |
| 442 my $seq1 = $array[0]; my $seq2 = $array[1]; my $seq3 = $array[2]; my $ABC = $array[3]; | |
| 443 | |
| 444 while ($ABC =~ m/(X+)/g) { | |
| 445 push (@seq1_delete_lengths,length($1)); | |
| 446 $count_seq1delete++; | |
| 447 } | |
| 448 | |
| 449 while ($ABC =~ m/(Y+)/g) { | |
| 450 push (@seq2_delete_lengths,length($1)); | |
| 451 $count_seq2delete++; | |
| 452 } | |
| 453 while ($ABC =~ m/(S+)/g) { | |
| 454 push (@seq3_delete_lengths,length($1)); | |
| 455 $count_seq3delete++; | |
| 456 } | |
| 457 while ($ABC =~ m/(Z+)/g) { | |
| 458 push (@seq1_insert_lengths,length($1)); | |
| 459 $count_seq1insert++; | |
| 460 } | |
| 461 while ($ABC =~ m/(W+)/g) { | |
| 462 push(@seq2_insert_lengths,length($1)); | |
| 463 $count_seq2insert++; | |
| 464 } | |
| 465 while ($ABC =~ m/(T+)/g) { | |
| 466 push (@seq3_insert_lengths,length($1)); | |
| 467 $count_seq3insert++; | |
| 468 } | |
| 469 } | |
| 470 elsif (scalar(@array) == 0){ | |
| 471 next; | |
| 472 } | |
| 473 | |
| 474 } | |
| 475 # convert to coordinates to + strand if align orient = - | |
| 476 sub convert_coords{ | |
| 477 my (@array) = @_; | |
| 478 my $s = $array[0]; | |
| 479 my $e = $array[1]; | |
| 480 my $o = $array[2]; | |
| 481 my $l = $array[3]; | |
| 482 my $start_plus = my $end_plus = 0; | |
| 483 | |
| 484 if ($o eq "-"){ | |
| 485 $start_plus = ($l - $e); | |
| 486 $end_plus = ($l - $s); | |
| 487 }elsif ($o eq "+"){ | |
| 488 $start_plus = $s; | |
| 489 $end_plus = $e; | |
| 490 } | |
| 491 | |
| 492 return ($start_plus, $end_plus); | |
| 493 } | |
| 494 | |
| 495 # get first line only for each event | |
| 496 sub get_starts_only{ | |
| 497 my (@test) = @_; | |
| 498 my $seq1 = my $seq2 = my $seq3 = my $indelType = my $old_seq1 = my $old_seq2 = my $old_seq3 = my $old_indelType = my $old_line = ""; | |
| 499 my $coord1 = my $coord2 = my $coord3 = my $old_coord1 = my $old_coord2 = my $old_coord3 = 0; | |
| 500 | |
| 501 my @matches = (); | |
| 502 my @seq1_insert = my @seq1_delete = my @seq2_insert = my @seq2_delete = my @seq3_insert = my @seq3_delete = (); | |
| 503 | |
| 504 | |
| 505 foreach my $line (@test){ | |
| 506 chomp($line); | |
| 507 $line =~ s/^\s*//; | |
| 508 $line =~ s/\s+/\t/g; | |
| 509 my @line1 = split(/\t/, $line); | |
| 510 $seq1 = $line1[1]; | |
| 511 $coord1 = $line1[2]; | |
| 512 $seq2 = $line1[4]; | |
| 513 $coord2 = $line1[5]; | |
| 514 $seq3 = $line1[7]; | |
| 515 $coord3 = $line1[8]; | |
| 516 $indelType = $line1[10]; | |
| 517 if ($indelType =~ m/$ingroup1/ && $indelType =~ m/insert/){ | |
| 518 if ($coord1 != $old_coord1+1 || ($coord2 != $old_coord2 || $coord3 != $old_coord3)){ | |
| 519 $start1++; | |
| 520 push (@seq1_insert_startOnly,$line); | |
| 521 } | |
| 522 } | |
| 523 elsif ($indelType =~ m/$ingroup1/ && $indelType =~ m/delete/){ | |
| 524 if ($coord1 != $old_coord1 || ($coord2 != $old_coord2+1 || $coord3 != $old_coord3+1)){ | |
| 525 $start2++; | |
| 526 push(@seq1_delete_startOnly,$line); | |
| 527 } | |
| 528 } | |
| 529 elsif ($indelType =~ m/$ingroup2/ && $indelType =~ m/insert/){ | |
| 530 if ($coord2 != $old_coord2+1 || ($coord1 != $old_coord1 || $coord3 != $old_coord3)){ | |
| 531 $start3++; | |
| 532 push(@seq2_insert_startOnly,$line); | |
| 533 } | |
| 534 } | |
| 535 elsif ($indelType =~ m/$ingroup2/ && $indelType =~ m/delete/){ | |
| 536 if ($coord2 != $old_coord2 || ($coord1 != $old_coord1+1 || $coord3 != $old_coord3+1)){ | |
| 537 $start4++; | |
| 538 push(@seq2_delete_startOnly,$line); | |
| 539 } | |
| 540 } | |
| 541 elsif ($indelType =~ m/$outgroup/ && $indelType =~ m/insert/){ | |
| 542 if ($coord3 != $old_coord3+1 || ($coord1 != $old_coord1 || $coord2 != $old_coord2)){ | |
| 543 $start5++; | |
| 544 push(@seq3_insert_startOnly,$line); | |
| 545 } | |
| 546 } | |
| 547 elsif ($indelType =~ m/$outgroup/ && $indelType =~ m/delete/){ | |
| 548 if ($coord3 != $old_coord3 || ($coord1 != $old_coord1+1 ||$coord2 != $old_coord2+1)){ | |
| 549 $start6++; | |
| 550 push(@seq3_delete_startOnly,$line); | |
| 551 } | |
| 552 } | |
| 553 $old_indelType = $indelType; | |
| 554 $old_seq1 = $seq1; | |
| 555 $old_coord1 = $coord1; | |
| 556 $old_seq2 = $seq2; | |
| 557 $old_coord2 = $coord2; | |
| 558 $old_seq3 = $seq3; | |
| 559 $old_coord3 = $coord3; | |
| 560 $old_line = $line; | |
| 561 } | |
| 562 } | |
| 563 # append lengths to each event start line | |
| 564 my $counter1; my $counter2; my $counter3; my $counter4; my $counter5; my $counter6; | |
| 565 my @final1 = my @final2 = my @final3 = my @final4 = my @final5 = my @final6 = (); | |
| 566 my $final_line1 = my $final_line2 = my $final_line3 = my $final_line4 = my $final_line5 = my $final_line6 = ""; | |
| 567 | |
| 568 | |
| 569 for ($counter1 = 0; $counter1 < @seq3_insert_startOnly; $counter1++){ | |
| 570 $final_line1 = join("\t",($seq3_insert_startOnly[$counter1],$seq3_insert_lengths[$counter1])); | |
| 571 push (@final1,$final_line1); | |
| 572 } | |
| 573 | |
| 574 for ($counter2 = 0; $counter2 < @seq3_delete_startOnly; $counter2++){ | |
| 575 $final_line2 = join("\t",($seq3_delete_startOnly[$counter2],$seq3_delete_lengths[$counter2])); | |
| 576 push(@final2,$final_line2); | |
| 577 } | |
| 578 | |
| 579 for ($counter3 = 0; $counter3 < @seq2_insert_startOnly; $counter3++){ | |
| 580 $final_line3 = join("\t",($seq2_insert_startOnly[$counter3],$seq2_insert_lengths[$counter3])); | |
| 581 push(@final3,$final_line3); | |
| 582 } | |
| 583 | |
| 584 for ($counter4 = 0; $counter4 < @seq2_delete_startOnly; $counter4++){ | |
| 585 $final_line4 = join("\t",($seq2_delete_startOnly[$counter4],$seq2_delete_lengths[$counter4])); | |
| 586 push(@final4,$final_line4); | |
| 587 } | |
| 588 | |
| 589 for ($counter5 = 0; $counter5 < @seq1_insert_startOnly; $counter5++){ | |
| 590 $final_line5 = join("\t",($seq1_insert_startOnly[$counter5],$seq1_insert_lengths[$counter5])); | |
| 591 push(@final5,$final_line5); | |
| 592 } | |
| 593 | |
| 594 for ($counter6 = 0; $counter6 < @seq1_delete_startOnly; $counter6++){ | |
| 595 $final_line6 = join("\t",($seq1_delete_startOnly[$counter6],$seq1_delete_lengths[$counter6])); | |
| 596 push(@final6,$final_line6); | |
| 597 } | |
| 598 | |
| 599 # format final output | |
| 600 # # if inserts, increase coords for the sequence inserted, other sequences give coords for 5' and 3' bases flanking the gap | |
| 601 # # for deletes, increase coords for other 2 sequences and the one deleted give coords for 5' and 3' bases flanking the gap | |
| 602 | |
| 603 get_final_format(@final5); | |
| 604 get_final_format(@final6); | |
| 605 get_final_format(@final3); | |
| 606 get_final_format(@final4); | |
| 607 get_final_format(@final1); | |
| 608 get_final_format(@final2); | |
| 609 | |
| 610 sub get_final_format{ | |
| 611 my (@final) = @_; | |
| 612 foreach (@final){ | |
| 613 my $event_line = $_; | |
| 614 my @events = split(/\t/, $event_line); | |
| 615 my $event_type = $events[10]; | |
| 616 my @name_align1 = split(/:/, $events[1]); | |
| 617 my @name_align2 = split(/:/, $events[4]); | |
| 618 my @name_align3 = split(/:/, $events[7]); | |
| 619 my $seq1_event_start = my $seq1_event_end = my $seq2_event_start = my $seq2_event_end = my $seq3_event_start = my $seq3_event_end = 0; | |
| 620 my $final_event_line = ""; | |
| 621 # seq1_insert | |
| 622 if ($event_type =~ m/$ingroup1/ && $event_type =~ m/insert/){ | |
| 623 # only increase coord for human | |
| 624 # remember that other two sequnences, the gap spans (coord - 1) --> coord | |
| 625 $seq1_event_start = ($events[2]); | |
| 626 $seq1_event_end = ($events[2]+$events[11]-1); | |
| 627 $seq2_event_start = ($events[5]-1); | |
| 628 $seq2_event_end = ($events[5]); | |
| 629 $seq3_event_start = ($events[8]-1); | |
| 630 $seq3_event_end = ($events[8]); | |
| 631 $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); | |
| 632 } | |
| 633 # seq1_delete | |
| 634 elsif ($event_type =~ m/$ingroup1/ && $event_type =~ m/delete/){ | |
| 635 # only increase coords for seq2 and seq3 | |
| 636 # remember for seq1, the gap spans (coord - 1) --> coord | |
| 637 $seq1_event_start = ($events[2]-1); | |
| 638 $seq1_event_end = ($events[2]); | |
| 639 $seq2_event_start = ($events[5]); | |
| 640 $seq2_event_end = ($events[5]+$events[11]-1); | |
| 641 $seq3_event_start = ($events[8]); | |
| 642 $seq3_event_end = ($events[8]+$events[11]-1); | |
| 643 $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); | |
| 644 } | |
| 645 # seq2_insert | |
| 646 elsif ($event_type =~ m/$ingroup2/ && $event_type =~ m/insert/){ | |
| 647 # only increase coords for seq2 | |
| 648 # remember that other two sequnences, the gap spans (coord - 1) --> coord | |
| 649 $seq1_event_start = ($events[2]-1); | |
| 650 $seq1_event_end = ($events[2]); | |
| 651 $seq2_event_start = ($events[5]); | |
| 652 $seq2_event_end = ($events[5]+$events[11]-1); | |
| 653 $seq3_event_start = ($events[8]-1); | |
| 654 $seq3_event_end = ($events[8]); | |
| 655 $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); | |
| 656 } | |
| 657 # seq2_delete | |
| 658 elsif ($event_type =~ m/$ingroup2/ && $event_type =~ m/delete/){ | |
| 659 # only increase coords for seq1 and seq3 | |
| 660 # remember for seq2, the gap spans (coord - 1) --> coord | |
| 661 $seq1_event_start = ($events[2]); | |
| 662 $seq1_event_end = ($events[2]+$events[11]-1); | |
| 663 $seq2_event_start = ($events[5]-1); | |
| 664 $seq2_event_end = ($events[5]); | |
| 665 $seq3_event_start = ($events[8]); | |
| 666 $seq3_event_end = ($events[8]+$events[11]-1); | |
| 667 $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); | |
| 668 } | |
| 669 # start testing w/seq3_insert | |
| 670 elsif ($event_type =~ m/$outgroup/ && $event_type =~ m/insert/){ | |
| 671 # only increase coord for rheMac | |
| 672 # remember that other two sequnences, the gap spans (coord - 1) --> coord | |
| 673 $seq1_event_start = ($events[2]-1); | |
| 674 $seq1_event_end = ($events[2]); | |
| 675 $seq2_event_start = ($events[5]-1); | |
| 676 $seq2_event_end = ($events[5]); | |
| 677 $seq3_event_start = ($events[8]); | |
| 678 $seq3_event_end = ($events[8]+$events[11]-1); | |
| 679 $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); | |
| 680 } | |
| 681 # seq3_delete | |
| 682 elsif ($event_type =~ m/$outgroup/ && $event_type =~ m/delete/){ | |
| 683 # only increase coords for seq1 and seq2 | |
| 684 # remember for seq3, the gap spans (coord - 1) --> coord | |
| 685 $seq1_event_start = ($events[2]); | |
| 686 $seq1_event_end = ($events[2]+$events[11]-1); | |
| 687 $seq2_event_start = ($events[5]); | |
| 688 $seq2_event_end = ($events[5]+$events[11]-1); | |
| 689 $seq3_event_start = ($events[8]-1); | |
| 690 $seq3_event_end = ($events[8]); | |
| 691 $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); | |
| 692 | |
| 693 } | |
| 694 | |
| 695 print OFILE2 "$final_event_line\n"; | |
| 696 } | |
| 697 } | |
| 698 close OFILE2; |
