| 
0
 | 
     1 #!/usr/bin/env perl
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 # parses ACE files of assembled repeats and detects potential
 | 
| 
 | 
     4 # LTR borders/insertion sites of LTR-retroelements
 | 
| 
 | 
     5 
 | 
| 
 | 
     6 # "site" is a region (size of $window) including TG or CA
 | 
| 
 | 
     7 # "out" is a region adjacent to the site, presumably representing insertion sites
 | 
| 
 | 
     8 
 | 
| 
 | 
     9 # this is RepeatExplorer version of "detect_insertion_sites_LTRs.pl"
 | 
| 
 | 
    10 # -m default set to 10 
 | 
| 
 | 
    11 
 | 
| 
 | 
    12 use Getopt::Std;
 | 
| 
 | 
    13 
 | 
| 
 | 
    14 
 | 
| 
 | 
    15 
 | 
| 
 | 
    16 
 | 
| 
 | 
    17 getopt('iowsmdrp');
 | 
| 
 | 
    18 if ($opt_i) {
 | 
| 
 | 
    19 	$infile = $opt_i;
 | 
| 
 | 
    20 } else {
 | 
| 
 | 
    21 	die "-i input_file_name missing\n";
 | 
| 
 | 
    22 }
 | 
| 
 | 
    23 
 | 
| 
 | 
    24 if ($opt_p) {
 | 
| 
 | 
    25     $db_PBS = $opt_p;
 | 
| 
 | 
    26 } else {
 | 
| 
 | 
    27     die "-p PBS database is missing\n";
 | 
| 
 | 
    28 }
 | 
| 
 | 
    29 
 | 
| 
 | 
    30 
 | 
| 
 | 
    31 
 | 
| 
 | 
    32 
 | 
| 
 | 
    33 if ($opt_o) {
 | 
| 
 | 
    34 	$outfile = $opt_o;
 | 
| 
 | 
    35 } else {
 | 
| 
 | 
    36 	die "-o output_file_name missing\n";
 | 
| 
 | 
    37 }
 | 
| 
 | 
    38 if ($opt_w) {
 | 
| 
 | 
    39 	$window = $opt_w;
 | 
| 
 | 
    40 } else {
 | 
| 
 | 
    41 	$window = 7;
 | 
| 
 | 
    42 	print "window size not set, using default ($window)\n";
 | 
| 
 | 
    43 }
 | 
| 
 | 
    44 if ($opt_s) {
 | 
| 
 | 
    45 	$min_site_depth = $opt_s;   # minimal average read depth (over $window) required for the site
 | 
| 
 | 
    46 } else {
 | 
| 
 | 
    47 	$min_site_depth = 10;
 | 
| 
 | 
    48 	print "min_site_depth not set, using default ($min_site_depth)\n";
 | 
| 
 | 
    49 }
 | 
| 
 | 
    50 if ($opt_m) {
 | 
| 
 | 
    51 	$min_out_masked = $opt_m;  # minimal average number of masked reads outside the site (over $window)
 | 
| 
 | 
    52 } else {
 | 
| 
 | 
    53 	$min_out_masked = 10;
 | 
| 
 | 
    54 	print "min_out_masked not set, using default ($min_out_masked)\n";
 | 
| 
 | 
    55 }
 | 
| 
 | 
    56 if ($opt_d) {
 | 
| 
 | 
    57 	$min_masked_fold_diff = $opt_d;  # how many times should the proportion of masked reads "out" be higher than in "site"
 | 
| 
 | 
    58 } else {
 | 
| 
 | 
    59 	$min_masked_fold_diff = 3;
 | 
| 
 | 
    60 	print "min_masked_fold_diff not set, using default ($min_masked_fold_diff)\n";
 | 
| 
 | 
    61 }
 | 
| 
 | 
    62 if ($opt_x) {
 | 
| 
 | 
    63 	$max_char_to_masked = $opt_x;  # max fold difference between depth in "site" and masked depth "out"
 | 
| 
 | 
    64 } else {
 | 
| 
 | 
    65 	$max_char_to_masked = 10;
 | 
| 
 | 
    66 	print "max_char_to_masked not set, using default ($max_char_to_masked)\n"; 
 | 
| 
 | 
    67 }
 | 
| 
 | 
    68 if ($opt_r) {
 | 
| 
 | 
    69 	$extract_region = $opt_r;
 | 
| 
 | 
    70 } else {
 | 
| 
 | 
    71 	$extract_region = 30;
 | 
| 
 | 
    72 	print "extract_region not set, using default ($extract_region)\n";
 | 
| 
 | 
    73 }
 | 
| 
 | 
    74 
 | 
| 
 | 
    75 # main
 | 
| 
 | 
    76 $out_table = $outfile;
 | 
| 
 | 
    77 $out_LTR   = "$outfile.LTR";
 | 
| 
 | 
    78 $out_ADJ   = "$outfile.ADJ";
 | 
| 
 | 
    79 open (IN,$infile) or die;
 | 
| 
 | 
    80 open (OUT,">$out_table") or die;
 | 
| 
 | 
    81 open (LTR,">$out_LTR") or die;  # LTR end regions as fasta seq; all are converetd to ....CA (so TG... regions are reverse-complemented)
 | 
| 
 | 
    82 open (ADJ,">$out_ADJ") or die;  # regions adjacent to LTR ends; if LTR end is rev-complemented, so is its corresponding adjacent region
 | 
| 
 | 
    83 print OUT "#Parameters:\n";
 | 
| 
 | 
    84 print OUT "#infile\t$infile\n#outfile\t$outfile\n#window\t$window\n#min_site_depth\t$min_site_depth\n";
 | 
| 
 | 
    85 print OUT "#min_out_masked\t$min_out_masked\n#min_masked_fold_diff\t$min_masked_fold_diff\n#max_char_to_masked\t$max_char_to_masked\n#extract_region\t$extract_region\n\n";
 | 
| 
 | 
    86 print OUT "CL\tcontig\tTG/CA\tposition\tsite\tsite_depth\tout_masked\tmasked_ratio_site\tmasked_ratio_out\tregion_in\tregion_out\tblast PBS\n";
 | 
| 
 | 
    87 print "Analyzing ACE file...\n";
 | 
| 
 | 
    88 $prev = 0;
 | 
| 
 | 
    89 while ($radek = <IN>) {
 | 
| 
 | 
    90 	$contig_found = &read_contig;
 | 
| 
 | 
    91 	if ($contig_found) {
 | 
| 
 | 
    92 		if ($cl > $prev) {
 | 
| 
 | 
    93 			$prev = $cl;
 | 
| 
 | 
    94 		}
 | 
| 
 | 
    95 		&reconstruct_assembly;
 | 
| 
 | 
    96 		&find_sites;
 | 
| 
 | 
    97 	}
 | 
| 
 | 
    98 }
 | 
| 
 | 
    99 close IN;
 | 
| 
 | 
   100 close OUT;
 | 
| 
 | 
   101 close LTR;
 | 
| 
 | 
   102 close ADJ;
 | 
| 
 | 
   103 print "Running blast against tRNA database...\n";
 | 
| 
 | 
   104 &add_PBS_info;    # detects similarities of sequences in ADJ to tRNA database (!!! reads ADJ and $out_table !!!)
 | 
| 
 | 
   105 
 | 
| 
 | 
   106 $error = system("rm $out_table");
 | 
| 
 | 
   107 if ($error) {
 | 
| 
 | 
   108 	print "Error removing $out_table\n";
 | 
| 
 | 
   109 }
 | 
| 
 | 
   110 
 | 
| 
 | 
   111 sub read_contig {
 | 
| 
 | 
   112 	my ($reads_found,$read_id);
 | 
| 
 | 
   113 	# global variables
 | 
| 
 | 
   114 	$cl = 0;
 | 
| 
 | 
   115 	$contig = 0;
 | 
| 
 | 
   116 	$cont_length = 0;
 | 
| 
 | 
   117 	$reads = 0;   # number of reads
 | 
| 
 | 
   118 	$cons = "";   # contig consensus (including gaps *)
 | 
| 
 | 
   119 	%read_starts = ();  # starts of reads within assembly
 | 
| 
 | 
   120 	%read_lengths = (); # length of reads in assembly (may contain gaps)
 | 
| 
 | 
   121 	%read_from = ();    # start of non-masked part of read sequence (relative to the read)
 | 
| 
 | 
   122 	%read_to = ();      # end of non-masked part of read sequence   
 | 
| 
 | 
   123 	
 | 
| 
 | 
   124 	do {
 | 
| 
 | 
   125 		if ($radek =~/^CO CL(\d+)Contig(\d+) (\d+) (\d+)/) {
 | 
| 
 | 
   126 			$cl = $1; $contig = $2; $cont_length = $3; $reads = $4;
 | 
| 
 | 
   127 			while ($radek = <IN> and length($radek) > 1) {
 | 
| 
 | 
   128 				chomp($radek);
 | 
| 
 | 
   129 				$cons .= $radek;
 | 
| 
 | 
   130 			}
 | 
| 
 | 
   131 			do {
 | 
| 
 | 
   132 				if ($radek =~/^AF (\S+) [UC] ([-]?\d+)/) {
 | 
| 
 | 
   133 					#print "$1 : $2\n";
 | 
| 
 | 
   134 					$read_starts{$1} = $2;
 | 
| 
 | 
   135 				}
 | 
| 
 | 
   136 			} while ($radek = <IN> and not $radek =~/^BS \d+ \d+/);
 | 
| 
 | 
   137 			$reads_found = 0;
 | 
| 
 | 
   138 			while ($reads_found < $reads) {        # expects previously specified number of reads 
 | 
| 
 | 
   139 				$radek = <IN>;              # expects RD lines with read ids alternate with QA lines
 | 
| 
 | 
   140 				if ($radek =~/^RD (\S+) (\d+)/) {
 | 
| 
 | 
   141 					$read_id = $1;
 | 
| 
 | 
   142 					$read_lengths{$read_id} = $2;
 | 
| 
 | 
   143 				}
 | 
| 
 | 
   144 				if ($radek =~/^QA (\d+) (\d+)/) {
 | 
| 
 | 
   145 					$read_from{$read_id} = $1;
 | 
| 
 | 
   146 					$read_to{$read_id} = $2;
 | 
| 
 | 
   147 					$reads_found++;
 | 
| 
 | 
   148 				}
 | 
| 
 | 
   149 			}
 | 
| 
 | 
   150 			return 1;
 | 
| 
 | 
   151 		}
 | 
| 
 | 
   152 	} while ($radek = <IN>);
 | 
| 
 | 
   153 	return 0;
 | 
| 
 | 
   154 }
 | 
| 
 | 
   155 
 | 
| 
 | 
   156 sub reconstruct_assembly {
 | 
| 
 | 
   157 	my ($id,$min_start,$max_end,$shift,$f,$poz);
 | 
| 
 | 
   158 	# global variables
 | 
| 
 | 
   159 	@assembly_seq = ();     # sequence at each position of assembly; it corresponds to consensus, or regions
 | 
| 
 | 
   160 	                        # before (-) and after (+) consensus [assembly may be longer than consensus]
 | 
| 
 | 
   161 	@assembly_char = ();    # number of assembly positions with non-masked characters (nucleotides or gaps)
 | 
| 
 | 
   162 	@assembly_masked = ();  # number of masked positions
 | 
| 
 | 
   163 	$assembly_length = 0;   # total length, including - and + regions
 | 
| 
 | 
   164 	
 | 
| 
 | 
   165 	$min_start = 1; $max_end = 1;
 | 
| 
 | 
   166 	foreach $id (keys(%read_starts)) {
 | 
| 
 | 
   167 		if ($min_start > $read_starts{$id}) {
 | 
| 
 | 
   168 			$min_start = $read_starts{$id};
 | 
| 
 | 
   169 		}
 | 
| 
 | 
   170 		if ($max_end < $read_starts{$id}+$read_lengths{$id}-1) {
 | 
| 
 | 
   171 			$max_end = $read_starts{$id}+$read_lengths{$id}-1;
 | 
| 
 | 
   172 		}
 | 
| 
 | 
   173 	}
 | 
| 
 | 
   174 	if ($min_start < 1) {
 | 
| 
 | 
   175 		$shift = abs($min_start) +1;
 | 
| 
 | 
   176 		$assembly_length = $shift + $max_end;
 | 
| 
 | 
   177 	} else {
 | 
| 
 | 
   178 		$assembly_length = $max_end;
 | 
| 
 | 
   179 		$shift = 0;
 | 
| 
 | 
   180 	}
 | 
| 
 | 
   181 	
 | 
| 
 | 
   182 	for ($f=1;$f<=$shift;$f++) {
 | 
| 
 | 
   183 		$assembly_seq[$f] = "-";
 | 
| 
 | 
   184 	}
 | 
| 
 | 
   185 	for ($f=1;$f<=$cont_length;$f++) {
 | 
| 
 | 
   186 		$assembly_seq[$f+$shift] = substr($cons,$f-1,1);
 | 
| 
 | 
   187 	}
 | 
| 
 | 
   188 	for ($f=$shift+$cont_length+1;$f<=$assembly_length;$f++) {
 | 
| 
 | 
   189 		$assembly_seq[$f] = "+";
 | 
| 
 | 
   190 	}
 | 
| 
 | 
   191 
 | 
| 
 | 
   192 	for ($f=1;$f<=$assembly_length;$f++) {
 | 
| 
 | 
   193 		$assembly_char[$f] = 0;
 | 
| 
 | 
   194 		$assembly_masked[$f] = 0;
 | 
| 
 | 
   195 	}
 | 
| 
 | 
   196 	foreach $id (keys(%read_starts)) {
 | 
| 
 | 
   197 		for ($f=1;$f<=$read_lengths{$id};$f++) {
 | 
| 
 | 
   198 			$poz = $read_starts{$id} + $shift + $f - 1;
 | 
| 
 | 
   199 			if ($f>=$read_from{$id} and $f<=$read_to{$id}) {
 | 
| 
 | 
   200 				$assembly_char[$poz]++;
 | 
| 
 | 
   201 			} else {
 | 
| 
 | 
   202 				$assembly_masked[$poz]++;
 | 
| 
 | 
   203 			}
 | 
| 
 | 
   204 		}
 | 
| 
 | 
   205 	}
 | 
| 
 | 
   206 }
 | 
| 
 | 
   207 
 | 
| 
 | 
   208 sub revcompl {
 | 
| 
 | 
   209 	my ($input) = @_;
 | 
| 
 | 
   210 	my ($base,$seq,$f);
 | 
| 
 | 
   211 	
 | 
| 
 | 
   212 	$seq = "";
 | 
| 
 | 
   213 	for ($f=length($input)-1;$f>=0;$f--) {
 | 
| 
 | 
   214 		$base = substr($input,$f,1);
 | 
| 
 | 
   215 		if ($base eq "A") {
 | 
| 
 | 
   216 			$seq .= "T";
 | 
| 
 | 
   217 		} elsif ($base eq "T") {
 | 
| 
 | 
   218 			$seq .= "A";
 | 
| 
 | 
   219 		} elsif ($base eq "G") {
 | 
| 
 | 
   220 			$seq .= "C";
 | 
| 
 | 
   221 		} elsif ($base eq "C") {
 | 
| 
 | 
   222 			$seq .= "G";
 | 
| 
 | 
   223 		} elsif ($base eq "+" or $base eq "-") {
 | 
| 
 | 
   224 			$seq .= $base;
 | 
| 
 | 
   225 		} else {
 | 
| 
 | 
   226 			$seq .= "N";
 | 
| 
 | 
   227 		}
 | 
| 
 | 
   228 	}
 | 
| 
 | 
   229 	return $seq;
 | 
| 
 | 
   230 }
 | 
| 
 | 
   231 
 | 
| 
 | 
   232 sub find_sites {
 | 
| 
 | 
   233 	my ($f,$site_sum_char,$site_sum_masked,$out_sum_char,$site_seq,$out_sum_masked,@TG,@CA,$pos);
 | 
| 
 | 
   234 	my ($masked_ratio_site,$masked_ratio_out,$site_depth,$out_masked,$region);
 | 
| 
 | 
   235 	
 | 
| 
 | 
   236 	# find positions of LTR borders (TG and CA)
 | 
| 
 | 
   237 	@TG = ();   # positions of "T"
 | 
| 
 | 
   238 	@CA = ();   # positions of "A"
 | 
| 
 | 
   239 	for ($f=1;$f<$assembly_length;$f++) {
 | 
| 
 | 
   240 		if ($assembly_seq[$f] eq "T" and $assembly_seq[$f+1] eq "G") {
 | 
| 
 | 
   241 			push(@TG,$f);
 | 
| 
 | 
   242 		}
 | 
| 
 | 
   243 		if ($assembly_seq[$f] eq "C" and $assembly_seq[$f+1] eq "A") {
 | 
| 
 | 
   244 			push(@CA,$f+1);
 | 
| 
 | 
   245 		}
 | 
| 
 | 
   246 	}
 | 
| 
 | 
   247 	
 | 
| 
 | 
   248 	foreach $pos (@TG) {
 | 
| 
 | 
   249 		if ($pos-$window > 0 and $pos+$window-1 <= $assembly_length) {
 | 
| 
 | 
   250 			$site_sum_char = 0; $site_sum_masked = 0; $site_seq = "";
 | 
| 
 | 
   251 			for ($f=$pos;$f<$pos+$window;$f++) {
 | 
| 
 | 
   252 				$site_sum_char += $assembly_char[$f];
 | 
| 
 | 
   253 				$site_sum_masked += $assembly_masked[$f];
 | 
| 
 | 
   254 				$site_seq .= $assembly_seq[$f];
 | 
| 
 | 
   255 			}
 | 
| 
 | 
   256 			$out_sum_char = 0; $out_sum_masked = 0;
 | 
| 
 | 
   257 			for ($f=$pos-$window;$f<$pos;$f++) {
 | 
| 
 | 
   258 				$out_sum_char += $assembly_char[$f];
 | 
| 
 | 
   259 				$out_sum_masked += $assembly_masked[$f];
 | 
| 
 | 
   260 			}
 | 
| 
 | 
   261 			$site_depth = sprintf("%0.1f",$site_sum_char/$window);   # average read (unmasked) depth over the site
 | 
| 
 | 
   262 			$out_masked = sprintf("%0.1f",$out_sum_masked/$window);  # average number of masked reads outside the site
 | 
| 
 | 
   263 			$masked_ratio_site = sprintf("%0.4f",$site_sum_masked/($site_sum_masked+$site_sum_char));
 | 
| 
 | 
   264 			$masked_ratio_out  = sprintf("%0.4f",$out_sum_masked/($out_sum_masked+$out_sum_char));
 | 
| 
 | 
   265 			if ($site_depth >= $min_site_depth and $out_masked >= $min_out_masked) {
 | 
| 
 | 
   266 				if ($masked_ratio_out >= ($min_masked_fold_diff * $masked_ratio_site) and $max_char_to_masked >= ($site_depth/$out_masked)) {
 | 
| 
 | 
   267 					print OUT "$cl\t$contig\tTG\t$pos\t$site_seq\t$site_depth\t$out_masked\t$masked_ratio_site\t$masked_ratio_out\t";
 | 
| 
 | 
   268 					$region = "";
 | 
| 
 | 
   269 					for ($f=$pos;$f<=$assembly_length;$f++) {
 | 
| 
 | 
   270 						if ($assembly_seq[$f] ne "*") {
 | 
| 
 | 
   271 							$region .= $assembly_seq[$f];
 | 
| 
 | 
   272 						}
 | 
| 
 | 
   273 						if (length($region) == $extract_region) {
 | 
| 
 | 
   274 							$f = $assembly_length;  # terminate cycle
 | 
| 
 | 
   275 						}
 | 
| 
 | 
   276 					}
 | 
| 
 | 
   277 					print OUT "$region\t";
 | 
| 
 | 
   278 					print LTR ">CL",$cl,"c".$contig."_TG_$pos\n";
 | 
| 
 | 
   279 					$region = &revcompl($region);
 | 
| 
 | 
   280 					print LTR "$region\n";
 | 
| 
 | 
   281 					$region = "";
 | 
| 
 | 
   282 					for ($f=$pos-1;$f>0;$f=$f-1) {
 | 
| 
 | 
   283 						if ($assembly_seq[$f] ne "*") {
 | 
| 
 | 
   284 							$region = $assembly_seq[$f].$region;
 | 
| 
 | 
   285 						}
 | 
| 
 | 
   286 						if (length($region) == $extract_region) {
 | 
| 
 | 
   287 							$f = 0;  # terminate cycle
 | 
| 
 | 
   288 						}
 | 
| 
 | 
   289 					}
 | 
| 
 | 
   290 					print OUT "$region\n";
 | 
| 
 | 
   291 					print ADJ ">CL",$cl,"c".$contig."_TG_$pos\n";
 | 
| 
 | 
   292 					$region = &revcompl($region);
 | 
| 
 | 
   293 					print ADJ "$region\n";
 | 
| 
 | 
   294 				}
 | 
| 
 | 
   295 			}
 | 
| 
 | 
   296 		}
 | 
| 
 | 
   297 	}
 | 
| 
 | 
   298 	
 | 
| 
 | 
   299 	foreach $pos (@CA) {
 | 
| 
 | 
   300 		if ($pos-$window+1 > 0 and $pos+$window <= $assembly_length) {
 | 
| 
 | 
   301 			$site_sum_char = 0; $site_sum_masked = 0; $site_seq = "";
 | 
| 
 | 
   302 			for ($f=$pos-$window+1;$f<=$pos;$f++) {
 | 
| 
 | 
   303 				$site_sum_char += $assembly_char[$f];
 | 
| 
 | 
   304 				$site_sum_masked += $assembly_masked[$f];
 | 
| 
 | 
   305 				$site_seq .= $assembly_seq[$f];
 | 
| 
 | 
   306 			}
 | 
| 
 | 
   307 			$out_sum_char = 0; $out_sum_masked = 0;
 | 
| 
 | 
   308 			for ($f=$pos+1;$f<=$pos+$window;$f++) {
 | 
| 
 | 
   309 				$out_sum_char += $assembly_char[$f];
 | 
| 
 | 
   310 				$out_sum_masked += $assembly_masked[$f];
 | 
| 
 | 
   311 			}
 | 
| 
 | 
   312 			$site_depth = sprintf("%0.1f",$site_sum_char/$window);   # average read (unmasked) depth over the site
 | 
| 
 | 
   313 			$out_masked = sprintf("%0.1f",$out_sum_masked/$window);  # average number of masked reads outside the site
 | 
| 
 | 
   314 			$masked_ratio_site = sprintf("%0.4f",$site_sum_masked/($site_sum_masked+$site_sum_char));
 | 
| 
 | 
   315 			$masked_ratio_out  = sprintf("%0.4f",$out_sum_masked/($out_sum_masked+$out_sum_char));
 | 
| 
 | 
   316 			if ($site_depth >= $min_site_depth and $out_masked >= $min_out_masked) {
 | 
| 
 | 
   317 				if ($masked_ratio_out >= ($min_masked_fold_diff * $masked_ratio_site) and $max_char_to_masked >= ($site_depth/$out_masked)) {
 | 
| 
 | 
   318 					print OUT "$cl\t$contig\tCA\t$pos\t$site_seq\t$site_depth\t$out_masked\t$masked_ratio_site\t$masked_ratio_out\t";
 | 
| 
 | 
   319 					$region = "";
 | 
| 
 | 
   320 					for ($f=$pos;$f>0;$f=$f-1) {
 | 
| 
 | 
   321 						if ($assembly_seq[$f] ne "*") {
 | 
| 
 | 
   322 							$region = $assembly_seq[$f].$region;
 | 
| 
 | 
   323 						}
 | 
| 
 | 
   324 						if (length($region) == $extract_region) {
 | 
| 
 | 
   325 							$f = 0;  # terminate cycle
 | 
| 
 | 
   326 						}
 | 
| 
 | 
   327 					}
 | 
| 
 | 
   328 					print OUT "$region\t";
 | 
| 
 | 
   329 					print LTR ">CL",$cl,"c".$contig."_CA_$pos\n";
 | 
| 
 | 
   330 					print LTR "$region\n";
 | 
| 
 | 
   331 					$region = "";
 | 
| 
 | 
   332 					for ($f=$pos+1;$f<=$assembly_length;$f++) {
 | 
| 
 | 
   333 						if ($assembly_seq[$f] ne "*") {
 | 
| 
 | 
   334 							$region .= $assembly_seq[$f];
 | 
| 
 | 
   335 						}
 | 
| 
 | 
   336 						if (length($region) == $extract_region) {
 | 
| 
 | 
   337 							$f = $assembly_length;  # terminate cycle
 | 
| 
 | 
   338 						}
 | 
| 
 | 
   339 					}
 | 
| 
 | 
   340 					print OUT "$region\n";
 | 
| 
 | 
   341 					print ADJ ">CL",$cl,"c".$contig."_CA_$pos\n";
 | 
| 
 | 
   342 					print ADJ "$region\n";
 | 
| 
 | 
   343 				}
 | 
| 
 | 
   344 			}
 | 
| 
 | 
   345 		}
 | 
| 
 | 
   346 	}
 | 
| 
 | 
   347 }
 | 
| 
 | 
   348 
 | 
| 
 | 
   349 sub add_PBS_info {
 | 
| 
 | 
   350 	my ($pbs_blast_command,@pol,$rad,$prev_query,@table,$tab_length);
 | 
| 
 | 
   351 	
 | 
| 
 | 
   352 	$pbs_blast_command = "blastall -p blastn -d $db_PBS -i $out_ADJ -m 8 -b 1 -e 1 -W 7 -F F";
 | 
| 
 | 
   353 	
 | 
| 
 | 
   354 	@table = ();
 | 
| 
 | 
   355 	open (TAB,$out_table) or die;
 | 
| 
 | 
   356 	while ($rad = <TAB>) {
 | 
| 
 | 
   357 		push(@table,$rad);
 | 
| 
 | 
   358 		$tab_length++;
 | 
| 
 | 
   359 	}
 | 
| 
 | 
   360 	close TAB;
 | 
| 
 | 
   361 	
 | 
| 
 | 
   362 	open (BLAST,"$pbs_blast_command |") or die;
 | 
| 
 | 
   363 	$prev_query = "";
 | 
| 
 | 
   364 	while ($rad = <BLAST>) {
 | 
| 
 | 
   365 		if ($rad =~/^CL(\d+)c(\d+)_(TG|CA)_(\d+)\t\S+\t\S+\t/) {   
 | 
| 
 | 
   366 			if ("$1\t$2\t$3\t$4" ne $prev_query) {           # to exclude additional HSPs from the same query/subject pair
 | 
| 
 | 
   367 				for ($f=0;$f<$tab_length;$f++) {       
 | 
| 
 | 
   368 					@pol = split(/\t/,$table[$f]);
 | 
| 
 | 
   369 					if ($pol[0] eq "$1" and $pol[1] eq "$2" and $pol[2] eq "$3" and $pol[3] eq "$4") {
 | 
| 
 | 
   370 						chomp($table[$f]);
 | 
| 
 | 
   371 						$table[$f] .= "\t$rad";
 | 
| 
 | 
   372 						$f = $tab_length;  # terminate cycle
 | 
| 
 | 
   373 					}
 | 
| 
 | 
   374 				}
 | 
| 
 | 
   375 				$prev_query = "$1\t$2\t$3\t$4";
 | 
| 
 | 
   376 			}
 | 
| 
 | 
   377 		}
 | 
| 
 | 
   378 	}
 | 
| 
 | 
   379 	close BLAST;
 | 
| 
 | 
   380 	
 | 
| 
 | 
   381 	open (TAB_WITH_BLAST,">$out_table.with_PBS_blast.csv") or die;
 | 
| 
 | 
   382 	for ($f=0;$f<$tab_length;$f++) {
 | 
| 
 | 
   383 		print TAB_WITH_BLAST $table[$f];
 | 
| 
 | 
   384 	}
 | 
| 
 | 
   385 	close TAB_WITH_BLAST;
 | 
| 
 | 
   386 }
 | 
| 
 | 
   387 
 | 
| 
 | 
   388 
 | 
| 
 | 
   389 
 | 
| 
 | 
   390 
 |