| 0 | 1 #!/usr/bin/env perl | 
|  | 2 | 
|  | 3 # parses ACE files of assembled repeats and detects potential | 
|  | 4 # LTR borders/insertion sites of LTR-retroelements | 
|  | 5 | 
|  | 6 # "site" is a region (size of $window) including TG or CA | 
|  | 7 # "out" is a region adjacent to the site, presumably representing insertion sites | 
|  | 8 | 
|  | 9 # this is RepeatExplorer version of "detect_insertion_sites_LTRs.pl" | 
|  | 10 # -m default set to 10 | 
|  | 11 | 
|  | 12 use Getopt::Std; | 
|  | 13 | 
|  | 14 | 
|  | 15 | 
|  | 16 | 
|  | 17 getopt('iowsmdrp'); | 
|  | 18 if ($opt_i) { | 
|  | 19 	$infile = $opt_i; | 
|  | 20 } else { | 
|  | 21 	die "-i input_file_name missing\n"; | 
|  | 22 } | 
|  | 23 | 
|  | 24 if ($opt_p) { | 
|  | 25     $db_PBS = $opt_p; | 
|  | 26 } else { | 
|  | 27     die "-p PBS database is missing\n"; | 
|  | 28 } | 
|  | 29 | 
|  | 30 | 
|  | 31 | 
|  | 32 | 
|  | 33 if ($opt_o) { | 
|  | 34 	$outfile = $opt_o; | 
|  | 35 } else { | 
|  | 36 	die "-o output_file_name missing\n"; | 
|  | 37 } | 
|  | 38 if ($opt_w) { | 
|  | 39 	$window = $opt_w; | 
|  | 40 } else { | 
|  | 41 	$window = 7; | 
|  | 42 	print "window size not set, using default ($window)\n"; | 
|  | 43 } | 
|  | 44 if ($opt_s) { | 
|  | 45 	$min_site_depth = $opt_s;   # minimal average read depth (over $window) required for the site | 
|  | 46 } else { | 
|  | 47 	$min_site_depth = 10; | 
|  | 48 	print "min_site_depth not set, using default ($min_site_depth)\n"; | 
|  | 49 } | 
|  | 50 if ($opt_m) { | 
|  | 51 	$min_out_masked = $opt_m;  # minimal average number of masked reads outside the site (over $window) | 
|  | 52 } else { | 
|  | 53 	$min_out_masked = 10; | 
|  | 54 	print "min_out_masked not set, using default ($min_out_masked)\n"; | 
|  | 55 } | 
|  | 56 if ($opt_d) { | 
|  | 57 	$min_masked_fold_diff = $opt_d;  # how many times should the proportion of masked reads "out" be higher than in "site" | 
|  | 58 } else { | 
|  | 59 	$min_masked_fold_diff = 3; | 
|  | 60 	print "min_masked_fold_diff not set, using default ($min_masked_fold_diff)\n"; | 
|  | 61 } | 
|  | 62 if ($opt_x) { | 
|  | 63 	$max_char_to_masked = $opt_x;  # max fold difference between depth in "site" and masked depth "out" | 
|  | 64 } else { | 
|  | 65 	$max_char_to_masked = 10; | 
|  | 66 	print "max_char_to_masked not set, using default ($max_char_to_masked)\n"; | 
|  | 67 } | 
|  | 68 if ($opt_r) { | 
|  | 69 	$extract_region = $opt_r; | 
|  | 70 } else { | 
|  | 71 	$extract_region = 30; | 
|  | 72 	print "extract_region not set, using default ($extract_region)\n"; | 
|  | 73 } | 
|  | 74 | 
|  | 75 # main | 
|  | 76 $out_table = $outfile; | 
|  | 77 $out_LTR   = "$outfile.LTR"; | 
|  | 78 $out_ADJ   = "$outfile.ADJ"; | 
|  | 79 open (IN,$infile) or die; | 
|  | 80 open (OUT,">$out_table") or die; | 
|  | 81 open (LTR,">$out_LTR") or die;  # LTR end regions as fasta seq; all are converetd to ....CA (so TG... regions are reverse-complemented) | 
|  | 82 open (ADJ,">$out_ADJ") or die;  # regions adjacent to LTR ends; if LTR end is rev-complemented, so is its corresponding adjacent region | 
|  | 83 print OUT "#Parameters:\n"; | 
|  | 84 print OUT "#infile\t$infile\n#outfile\t$outfile\n#window\t$window\n#min_site_depth\t$min_site_depth\n"; | 
|  | 85 print OUT "#min_out_masked\t$min_out_masked\n#min_masked_fold_diff\t$min_masked_fold_diff\n#max_char_to_masked\t$max_char_to_masked\n#extract_region\t$extract_region\n\n"; | 
|  | 86 print OUT "CL\tcontig\tTG/CA\tposition\tsite\tsite_depth\tout_masked\tmasked_ratio_site\tmasked_ratio_out\tregion_in\tregion_out\tblast PBS\n"; | 
|  | 87 print "Analyzing ACE file...\n"; | 
|  | 88 $prev = 0; | 
|  | 89 while ($radek = <IN>) { | 
|  | 90 	$contig_found = &read_contig; | 
|  | 91 	if ($contig_found) { | 
|  | 92 		if ($cl > $prev) { | 
|  | 93 			$prev = $cl; | 
|  | 94 		} | 
|  | 95 		&reconstruct_assembly; | 
|  | 96 		&find_sites; | 
|  | 97 	} | 
|  | 98 } | 
|  | 99 close IN; | 
|  | 100 close OUT; | 
|  | 101 close LTR; | 
|  | 102 close ADJ; | 
|  | 103 print "Running blast against tRNA database...\n"; | 
|  | 104 &add_PBS_info;    # detects similarities of sequences in ADJ to tRNA database (!!! reads ADJ and $out_table !!!) | 
|  | 105 | 
|  | 106 $error = system("rm $out_table"); | 
|  | 107 if ($error) { | 
|  | 108 	print "Error removing $out_table\n"; | 
|  | 109 } | 
|  | 110 | 
|  | 111 sub read_contig { | 
|  | 112 	my ($reads_found,$read_id); | 
|  | 113 	# global variables | 
|  | 114 	$cl = 0; | 
|  | 115 	$contig = 0; | 
|  | 116 	$cont_length = 0; | 
|  | 117 	$reads = 0;   # number of reads | 
|  | 118 	$cons = "";   # contig consensus (including gaps *) | 
|  | 119 	%read_starts = ();  # starts of reads within assembly | 
|  | 120 	%read_lengths = (); # length of reads in assembly (may contain gaps) | 
|  | 121 	%read_from = ();    # start of non-masked part of read sequence (relative to the read) | 
|  | 122 	%read_to = ();      # end of non-masked part of read sequence | 
|  | 123 | 
|  | 124 	do { | 
|  | 125 		if ($radek =~/^CO CL(\d+)Contig(\d+) (\d+) (\d+)/) { | 
|  | 126 			$cl = $1; $contig = $2; $cont_length = $3; $reads = $4; | 
|  | 127 			while ($radek = <IN> and length($radek) > 1) { | 
|  | 128 				chomp($radek); | 
|  | 129 				$cons .= $radek; | 
|  | 130 			} | 
|  | 131 			do { | 
|  | 132 				if ($radek =~/^AF (\S+) [UC] ([-]?\d+)/) { | 
|  | 133 					#print "$1 : $2\n"; | 
|  | 134 					$read_starts{$1} = $2; | 
|  | 135 				} | 
|  | 136 			} while ($radek = <IN> and not $radek =~/^BS \d+ \d+/); | 
|  | 137 			$reads_found = 0; | 
|  | 138 			while ($reads_found < $reads) {        # expects previously specified number of reads | 
|  | 139 				$radek = <IN>;              # expects RD lines with read ids alternate with QA lines | 
|  | 140 				if ($radek =~/^RD (\S+) (\d+)/) { | 
|  | 141 					$read_id = $1; | 
|  | 142 					$read_lengths{$read_id} = $2; | 
|  | 143 				} | 
|  | 144 				if ($radek =~/^QA (\d+) (\d+)/) { | 
|  | 145 					$read_from{$read_id} = $1; | 
|  | 146 					$read_to{$read_id} = $2; | 
|  | 147 					$reads_found++; | 
|  | 148 				} | 
|  | 149 			} | 
|  | 150 			return 1; | 
|  | 151 		} | 
|  | 152 	} while ($radek = <IN>); | 
|  | 153 	return 0; | 
|  | 154 } | 
|  | 155 | 
|  | 156 sub reconstruct_assembly { | 
|  | 157 	my ($id,$min_start,$max_end,$shift,$f,$poz); | 
|  | 158 	# global variables | 
|  | 159 	@assembly_seq = ();     # sequence at each position of assembly; it corresponds to consensus, or regions | 
|  | 160 	                        # before (-) and after (+) consensus [assembly may be longer than consensus] | 
|  | 161 	@assembly_char = ();    # number of assembly positions with non-masked characters (nucleotides or gaps) | 
|  | 162 	@assembly_masked = ();  # number of masked positions | 
|  | 163 	$assembly_length = 0;   # total length, including - and + regions | 
|  | 164 | 
|  | 165 	$min_start = 1; $max_end = 1; | 
|  | 166 	foreach $id (keys(%read_starts)) { | 
|  | 167 		if ($min_start > $read_starts{$id}) { | 
|  | 168 			$min_start = $read_starts{$id}; | 
|  | 169 		} | 
|  | 170 		if ($max_end < $read_starts{$id}+$read_lengths{$id}-1) { | 
|  | 171 			$max_end = $read_starts{$id}+$read_lengths{$id}-1; | 
|  | 172 		} | 
|  | 173 	} | 
|  | 174 	if ($min_start < 1) { | 
|  | 175 		$shift = abs($min_start) +1; | 
|  | 176 		$assembly_length = $shift + $max_end; | 
|  | 177 	} else { | 
|  | 178 		$assembly_length = $max_end; | 
|  | 179 		$shift = 0; | 
|  | 180 	} | 
|  | 181 | 
|  | 182 	for ($f=1;$f<=$shift;$f++) { | 
|  | 183 		$assembly_seq[$f] = "-"; | 
|  | 184 	} | 
|  | 185 	for ($f=1;$f<=$cont_length;$f++) { | 
|  | 186 		$assembly_seq[$f+$shift] = substr($cons,$f-1,1); | 
|  | 187 	} | 
|  | 188 	for ($f=$shift+$cont_length+1;$f<=$assembly_length;$f++) { | 
|  | 189 		$assembly_seq[$f] = "+"; | 
|  | 190 	} | 
|  | 191 | 
|  | 192 	for ($f=1;$f<=$assembly_length;$f++) { | 
|  | 193 		$assembly_char[$f] = 0; | 
|  | 194 		$assembly_masked[$f] = 0; | 
|  | 195 	} | 
|  | 196 	foreach $id (keys(%read_starts)) { | 
|  | 197 		for ($f=1;$f<=$read_lengths{$id};$f++) { | 
|  | 198 			$poz = $read_starts{$id} + $shift + $f - 1; | 
|  | 199 			if ($f>=$read_from{$id} and $f<=$read_to{$id}) { | 
|  | 200 				$assembly_char[$poz]++; | 
|  | 201 			} else { | 
|  | 202 				$assembly_masked[$poz]++; | 
|  | 203 			} | 
|  | 204 		} | 
|  | 205 	} | 
|  | 206 } | 
|  | 207 | 
|  | 208 sub revcompl { | 
|  | 209 	my ($input) = @_; | 
|  | 210 	my ($base,$seq,$f); | 
|  | 211 | 
|  | 212 	$seq = ""; | 
|  | 213 	for ($f=length($input)-1;$f>=0;$f--) { | 
|  | 214 		$base = substr($input,$f,1); | 
|  | 215 		if ($base eq "A") { | 
|  | 216 			$seq .= "T"; | 
|  | 217 		} elsif ($base eq "T") { | 
|  | 218 			$seq .= "A"; | 
|  | 219 		} elsif ($base eq "G") { | 
|  | 220 			$seq .= "C"; | 
|  | 221 		} elsif ($base eq "C") { | 
|  | 222 			$seq .= "G"; | 
|  | 223 		} elsif ($base eq "+" or $base eq "-") { | 
|  | 224 			$seq .= $base; | 
|  | 225 		} else { | 
|  | 226 			$seq .= "N"; | 
|  | 227 		} | 
|  | 228 	} | 
|  | 229 	return $seq; | 
|  | 230 } | 
|  | 231 | 
|  | 232 sub find_sites { | 
|  | 233 	my ($f,$site_sum_char,$site_sum_masked,$out_sum_char,$site_seq,$out_sum_masked,@TG,@CA,$pos); | 
|  | 234 	my ($masked_ratio_site,$masked_ratio_out,$site_depth,$out_masked,$region); | 
|  | 235 | 
|  | 236 	# find positions of LTR borders (TG and CA) | 
|  | 237 	@TG = ();   # positions of "T" | 
|  | 238 	@CA = ();   # positions of "A" | 
|  | 239 	for ($f=1;$f<$assembly_length;$f++) { | 
|  | 240 		if ($assembly_seq[$f] eq "T" and $assembly_seq[$f+1] eq "G") { | 
|  | 241 			push(@TG,$f); | 
|  | 242 		} | 
|  | 243 		if ($assembly_seq[$f] eq "C" and $assembly_seq[$f+1] eq "A") { | 
|  | 244 			push(@CA,$f+1); | 
|  | 245 		} | 
|  | 246 	} | 
|  | 247 | 
|  | 248 	foreach $pos (@TG) { | 
|  | 249 		if ($pos-$window > 0 and $pos+$window-1 <= $assembly_length) { | 
|  | 250 			$site_sum_char = 0; $site_sum_masked = 0; $site_seq = ""; | 
|  | 251 			for ($f=$pos;$f<$pos+$window;$f++) { | 
|  | 252 				$site_sum_char += $assembly_char[$f]; | 
|  | 253 				$site_sum_masked += $assembly_masked[$f]; | 
|  | 254 				$site_seq .= $assembly_seq[$f]; | 
|  | 255 			} | 
|  | 256 			$out_sum_char = 0; $out_sum_masked = 0; | 
|  | 257 			for ($f=$pos-$window;$f<$pos;$f++) { | 
|  | 258 				$out_sum_char += $assembly_char[$f]; | 
|  | 259 				$out_sum_masked += $assembly_masked[$f]; | 
|  | 260 			} | 
|  | 261 			$site_depth = sprintf("%0.1f",$site_sum_char/$window);   # average read (unmasked) depth over the site | 
|  | 262 			$out_masked = sprintf("%0.1f",$out_sum_masked/$window);  # average number of masked reads outside the site | 
|  | 263 			$masked_ratio_site = sprintf("%0.4f",$site_sum_masked/($site_sum_masked+$site_sum_char)); | 
|  | 264 			$masked_ratio_out  = sprintf("%0.4f",$out_sum_masked/($out_sum_masked+$out_sum_char)); | 
|  | 265 			if ($site_depth >= $min_site_depth and $out_masked >= $min_out_masked) { | 
|  | 266 				if ($masked_ratio_out >= ($min_masked_fold_diff * $masked_ratio_site) and $max_char_to_masked >= ($site_depth/$out_masked)) { | 
|  | 267 					print OUT "$cl\t$contig\tTG\t$pos\t$site_seq\t$site_depth\t$out_masked\t$masked_ratio_site\t$masked_ratio_out\t"; | 
|  | 268 					$region = ""; | 
|  | 269 					for ($f=$pos;$f<=$assembly_length;$f++) { | 
|  | 270 						if ($assembly_seq[$f] ne "*") { | 
|  | 271 							$region .= $assembly_seq[$f]; | 
|  | 272 						} | 
|  | 273 						if (length($region) == $extract_region) { | 
|  | 274 							$f = $assembly_length;  # terminate cycle | 
|  | 275 						} | 
|  | 276 					} | 
|  | 277 					print OUT "$region\t"; | 
|  | 278 					print LTR ">CL",$cl,"c".$contig."_TG_$pos\n"; | 
|  | 279 					$region = &revcompl($region); | 
|  | 280 					print LTR "$region\n"; | 
|  | 281 					$region = ""; | 
|  | 282 					for ($f=$pos-1;$f>0;$f=$f-1) { | 
|  | 283 						if ($assembly_seq[$f] ne "*") { | 
|  | 284 							$region = $assembly_seq[$f].$region; | 
|  | 285 						} | 
|  | 286 						if (length($region) == $extract_region) { | 
|  | 287 							$f = 0;  # terminate cycle | 
|  | 288 						} | 
|  | 289 					} | 
|  | 290 					print OUT "$region\n"; | 
|  | 291 					print ADJ ">CL",$cl,"c".$contig."_TG_$pos\n"; | 
|  | 292 					$region = &revcompl($region); | 
|  | 293 					print ADJ "$region\n"; | 
|  | 294 				} | 
|  | 295 			} | 
|  | 296 		} | 
|  | 297 	} | 
|  | 298 | 
|  | 299 	foreach $pos (@CA) { | 
|  | 300 		if ($pos-$window+1 > 0 and $pos+$window <= $assembly_length) { | 
|  | 301 			$site_sum_char = 0; $site_sum_masked = 0; $site_seq = ""; | 
|  | 302 			for ($f=$pos-$window+1;$f<=$pos;$f++) { | 
|  | 303 				$site_sum_char += $assembly_char[$f]; | 
|  | 304 				$site_sum_masked += $assembly_masked[$f]; | 
|  | 305 				$site_seq .= $assembly_seq[$f]; | 
|  | 306 			} | 
|  | 307 			$out_sum_char = 0; $out_sum_masked = 0; | 
|  | 308 			for ($f=$pos+1;$f<=$pos+$window;$f++) { | 
|  | 309 				$out_sum_char += $assembly_char[$f]; | 
|  | 310 				$out_sum_masked += $assembly_masked[$f]; | 
|  | 311 			} | 
|  | 312 			$site_depth = sprintf("%0.1f",$site_sum_char/$window);   # average read (unmasked) depth over the site | 
|  | 313 			$out_masked = sprintf("%0.1f",$out_sum_masked/$window);  # average number of masked reads outside the site | 
|  | 314 			$masked_ratio_site = sprintf("%0.4f",$site_sum_masked/($site_sum_masked+$site_sum_char)); | 
|  | 315 			$masked_ratio_out  = sprintf("%0.4f",$out_sum_masked/($out_sum_masked+$out_sum_char)); | 
|  | 316 			if ($site_depth >= $min_site_depth and $out_masked >= $min_out_masked) { | 
|  | 317 				if ($masked_ratio_out >= ($min_masked_fold_diff * $masked_ratio_site) and $max_char_to_masked >= ($site_depth/$out_masked)) { | 
|  | 318 					print OUT "$cl\t$contig\tCA\t$pos\t$site_seq\t$site_depth\t$out_masked\t$masked_ratio_site\t$masked_ratio_out\t"; | 
|  | 319 					$region = ""; | 
|  | 320 					for ($f=$pos;$f>0;$f=$f-1) { | 
|  | 321 						if ($assembly_seq[$f] ne "*") { | 
|  | 322 							$region = $assembly_seq[$f].$region; | 
|  | 323 						} | 
|  | 324 						if (length($region) == $extract_region) { | 
|  | 325 							$f = 0;  # terminate cycle | 
|  | 326 						} | 
|  | 327 					} | 
|  | 328 					print OUT "$region\t"; | 
|  | 329 					print LTR ">CL",$cl,"c".$contig."_CA_$pos\n"; | 
|  | 330 					print LTR "$region\n"; | 
|  | 331 					$region = ""; | 
|  | 332 					for ($f=$pos+1;$f<=$assembly_length;$f++) { | 
|  | 333 						if ($assembly_seq[$f] ne "*") { | 
|  | 334 							$region .= $assembly_seq[$f]; | 
|  | 335 						} | 
|  | 336 						if (length($region) == $extract_region) { | 
|  | 337 							$f = $assembly_length;  # terminate cycle | 
|  | 338 						} | 
|  | 339 					} | 
|  | 340 					print OUT "$region\n"; | 
|  | 341 					print ADJ ">CL",$cl,"c".$contig."_CA_$pos\n"; | 
|  | 342 					print ADJ "$region\n"; | 
|  | 343 				} | 
|  | 344 			} | 
|  | 345 		} | 
|  | 346 	} | 
|  | 347 } | 
|  | 348 | 
|  | 349 sub add_PBS_info { | 
|  | 350 	my ($pbs_blast_command,@pol,$rad,$prev_query,@table,$tab_length); | 
|  | 351 | 
|  | 352 	$pbs_blast_command = "blastall -p blastn -d $db_PBS -i $out_ADJ -m 8 -b 1 -e 1 -W 7 -F F"; | 
|  | 353 | 
|  | 354 	@table = (); | 
|  | 355 	open (TAB,$out_table) or die; | 
|  | 356 	while ($rad = <TAB>) { | 
|  | 357 		push(@table,$rad); | 
|  | 358 		$tab_length++; | 
|  | 359 	} | 
|  | 360 	close TAB; | 
|  | 361 | 
|  | 362 	open (BLAST,"$pbs_blast_command |") or die; | 
|  | 363 	$prev_query = ""; | 
|  | 364 	while ($rad = <BLAST>) { | 
|  | 365 		if ($rad =~/^CL(\d+)c(\d+)_(TG|CA)_(\d+)\t\S+\t\S+\t/) { | 
|  | 366 			if ("$1\t$2\t$3\t$4" ne $prev_query) {           # to exclude additional HSPs from the same query/subject pair | 
|  | 367 				for ($f=0;$f<$tab_length;$f++) { | 
|  | 368 					@pol = split(/\t/,$table[$f]); | 
|  | 369 					if ($pol[0] eq "$1" and $pol[1] eq "$2" and $pol[2] eq "$3" and $pol[3] eq "$4") { | 
|  | 370 						chomp($table[$f]); | 
|  | 371 						$table[$f] .= "\t$rad"; | 
|  | 372 						$f = $tab_length;  # terminate cycle | 
|  | 373 					} | 
|  | 374 				} | 
|  | 375 				$prev_query = "$1\t$2\t$3\t$4"; | 
|  | 376 			} | 
|  | 377 		} | 
|  | 378 	} | 
|  | 379 	close BLAST; | 
|  | 380 | 
|  | 381 	open (TAB_WITH_BLAST,">$out_table.with_PBS_blast.csv") or die; | 
|  | 382 	for ($f=0;$f<$tab_length;$f++) { | 
|  | 383 		print TAB_WITH_BLAST $table[$f]; | 
|  | 384 	} | 
|  | 385 	close TAB_WITH_BLAST; | 
|  | 386 } | 
|  | 387 | 
|  | 388 | 
|  | 389 | 
|  | 390 |