0
|
1 #!/usr/bin/env perl
|
|
2
|
|
3 # parses ACE files of assembled repeats and detects potential
|
|
4 # LTR borders/insertion sites of LTR-retroelements
|
|
5
|
|
6 # "site" is a region (size of $window) including TG or CA
|
|
7 # "out" is a region adjacent to the site, presumably representing insertion sites
|
|
8
|
|
9 # this is RepeatExplorer version of "detect_insertion_sites_LTRs.pl"
|
|
10 # -m default set to 10
|
|
11
|
|
12 use Getopt::Std;
|
|
13
|
|
14
|
|
15
|
|
16
|
|
17 getopt('iowsmdrp');
|
|
18 if ($opt_i) {
|
|
19 $infile = $opt_i;
|
|
20 } else {
|
|
21 die "-i input_file_name missing\n";
|
|
22 }
|
|
23
|
|
24 if ($opt_p) {
|
|
25 $db_PBS = $opt_p;
|
|
26 } else {
|
|
27 die "-p PBS database is missing\n";
|
|
28 }
|
|
29
|
|
30
|
|
31
|
|
32
|
|
33 if ($opt_o) {
|
|
34 $outfile = $opt_o;
|
|
35 } else {
|
|
36 die "-o output_file_name missing\n";
|
|
37 }
|
|
38 if ($opt_w) {
|
|
39 $window = $opt_w;
|
|
40 } else {
|
|
41 $window = 7;
|
|
42 print "window size not set, using default ($window)\n";
|
|
43 }
|
|
44 if ($opt_s) {
|
|
45 $min_site_depth = $opt_s; # minimal average read depth (over $window) required for the site
|
|
46 } else {
|
|
47 $min_site_depth = 10;
|
|
48 print "min_site_depth not set, using default ($min_site_depth)\n";
|
|
49 }
|
|
50 if ($opt_m) {
|
|
51 $min_out_masked = $opt_m; # minimal average number of masked reads outside the site (over $window)
|
|
52 } else {
|
|
53 $min_out_masked = 10;
|
|
54 print "min_out_masked not set, using default ($min_out_masked)\n";
|
|
55 }
|
|
56 if ($opt_d) {
|
|
57 $min_masked_fold_diff = $opt_d; # how many times should the proportion of masked reads "out" be higher than in "site"
|
|
58 } else {
|
|
59 $min_masked_fold_diff = 3;
|
|
60 print "min_masked_fold_diff not set, using default ($min_masked_fold_diff)\n";
|
|
61 }
|
|
62 if ($opt_x) {
|
|
63 $max_char_to_masked = $opt_x; # max fold difference between depth in "site" and masked depth "out"
|
|
64 } else {
|
|
65 $max_char_to_masked = 10;
|
|
66 print "max_char_to_masked not set, using default ($max_char_to_masked)\n";
|
|
67 }
|
|
68 if ($opt_r) {
|
|
69 $extract_region = $opt_r;
|
|
70 } else {
|
|
71 $extract_region = 30;
|
|
72 print "extract_region not set, using default ($extract_region)\n";
|
|
73 }
|
|
74
|
|
75 # main
|
|
76 $out_table = $outfile;
|
|
77 $out_LTR = "$outfile.LTR";
|
|
78 $out_ADJ = "$outfile.ADJ";
|
|
79 open (IN,$infile) or die;
|
|
80 open (OUT,">$out_table") or die;
|
|
81 open (LTR,">$out_LTR") or die; # LTR end regions as fasta seq; all are converetd to ....CA (so TG... regions are reverse-complemented)
|
|
82 open (ADJ,">$out_ADJ") or die; # regions adjacent to LTR ends; if LTR end is rev-complemented, so is its corresponding adjacent region
|
|
83 print OUT "#Parameters:\n";
|
|
84 print OUT "#infile\t$infile\n#outfile\t$outfile\n#window\t$window\n#min_site_depth\t$min_site_depth\n";
|
|
85 print OUT "#min_out_masked\t$min_out_masked\n#min_masked_fold_diff\t$min_masked_fold_diff\n#max_char_to_masked\t$max_char_to_masked\n#extract_region\t$extract_region\n\n";
|
|
86 print OUT "CL\tcontig\tTG/CA\tposition\tsite\tsite_depth\tout_masked\tmasked_ratio_site\tmasked_ratio_out\tregion_in\tregion_out\tblast PBS\n";
|
|
87 print "Analyzing ACE file...\n";
|
|
88 $prev = 0;
|
|
89 while ($radek = <IN>) {
|
|
90 $contig_found = &read_contig;
|
|
91 if ($contig_found) {
|
|
92 if ($cl > $prev) {
|
|
93 $prev = $cl;
|
|
94 }
|
|
95 &reconstruct_assembly;
|
|
96 &find_sites;
|
|
97 }
|
|
98 }
|
|
99 close IN;
|
|
100 close OUT;
|
|
101 close LTR;
|
|
102 close ADJ;
|
|
103 print "Running blast against tRNA database...\n";
|
|
104 &add_PBS_info; # detects similarities of sequences in ADJ to tRNA database (!!! reads ADJ and $out_table !!!)
|
|
105
|
|
106 $error = system("rm $out_table");
|
|
107 if ($error) {
|
|
108 print "Error removing $out_table\n";
|
|
109 }
|
|
110
|
|
111 sub read_contig {
|
|
112 my ($reads_found,$read_id);
|
|
113 # global variables
|
|
114 $cl = 0;
|
|
115 $contig = 0;
|
|
116 $cont_length = 0;
|
|
117 $reads = 0; # number of reads
|
|
118 $cons = ""; # contig consensus (including gaps *)
|
|
119 %read_starts = (); # starts of reads within assembly
|
|
120 %read_lengths = (); # length of reads in assembly (may contain gaps)
|
|
121 %read_from = (); # start of non-masked part of read sequence (relative to the read)
|
|
122 %read_to = (); # end of non-masked part of read sequence
|
|
123
|
|
124 do {
|
|
125 if ($radek =~/^CO CL(\d+)Contig(\d+) (\d+) (\d+)/) {
|
|
126 $cl = $1; $contig = $2; $cont_length = $3; $reads = $4;
|
|
127 while ($radek = <IN> and length($radek) > 1) {
|
|
128 chomp($radek);
|
|
129 $cons .= $radek;
|
|
130 }
|
|
131 do {
|
|
132 if ($radek =~/^AF (\S+) [UC] ([-]?\d+)/) {
|
|
133 #print "$1 : $2\n";
|
|
134 $read_starts{$1} = $2;
|
|
135 }
|
|
136 } while ($radek = <IN> and not $radek =~/^BS \d+ \d+/);
|
|
137 $reads_found = 0;
|
|
138 while ($reads_found < $reads) { # expects previously specified number of reads
|
|
139 $radek = <IN>; # expects RD lines with read ids alternate with QA lines
|
|
140 if ($radek =~/^RD (\S+) (\d+)/) {
|
|
141 $read_id = $1;
|
|
142 $read_lengths{$read_id} = $2;
|
|
143 }
|
|
144 if ($radek =~/^QA (\d+) (\d+)/) {
|
|
145 $read_from{$read_id} = $1;
|
|
146 $read_to{$read_id} = $2;
|
|
147 $reads_found++;
|
|
148 }
|
|
149 }
|
|
150 return 1;
|
|
151 }
|
|
152 } while ($radek = <IN>);
|
|
153 return 0;
|
|
154 }
|
|
155
|
|
156 sub reconstruct_assembly {
|
|
157 my ($id,$min_start,$max_end,$shift,$f,$poz);
|
|
158 # global variables
|
|
159 @assembly_seq = (); # sequence at each position of assembly; it corresponds to consensus, or regions
|
|
160 # before (-) and after (+) consensus [assembly may be longer than consensus]
|
|
161 @assembly_char = (); # number of assembly positions with non-masked characters (nucleotides or gaps)
|
|
162 @assembly_masked = (); # number of masked positions
|
|
163 $assembly_length = 0; # total length, including - and + regions
|
|
164
|
|
165 $min_start = 1; $max_end = 1;
|
|
166 foreach $id (keys(%read_starts)) {
|
|
167 if ($min_start > $read_starts{$id}) {
|
|
168 $min_start = $read_starts{$id};
|
|
169 }
|
|
170 if ($max_end < $read_starts{$id}+$read_lengths{$id}-1) {
|
|
171 $max_end = $read_starts{$id}+$read_lengths{$id}-1;
|
|
172 }
|
|
173 }
|
|
174 if ($min_start < 1) {
|
|
175 $shift = abs($min_start) +1;
|
|
176 $assembly_length = $shift + $max_end;
|
|
177 } else {
|
|
178 $assembly_length = $max_end;
|
|
179 $shift = 0;
|
|
180 }
|
|
181
|
|
182 for ($f=1;$f<=$shift;$f++) {
|
|
183 $assembly_seq[$f] = "-";
|
|
184 }
|
|
185 for ($f=1;$f<=$cont_length;$f++) {
|
|
186 $assembly_seq[$f+$shift] = substr($cons,$f-1,1);
|
|
187 }
|
|
188 for ($f=$shift+$cont_length+1;$f<=$assembly_length;$f++) {
|
|
189 $assembly_seq[$f] = "+";
|
|
190 }
|
|
191
|
|
192 for ($f=1;$f<=$assembly_length;$f++) {
|
|
193 $assembly_char[$f] = 0;
|
|
194 $assembly_masked[$f] = 0;
|
|
195 }
|
|
196 foreach $id (keys(%read_starts)) {
|
|
197 for ($f=1;$f<=$read_lengths{$id};$f++) {
|
|
198 $poz = $read_starts{$id} + $shift + $f - 1;
|
|
199 if ($f>=$read_from{$id} and $f<=$read_to{$id}) {
|
|
200 $assembly_char[$poz]++;
|
|
201 } else {
|
|
202 $assembly_masked[$poz]++;
|
|
203 }
|
|
204 }
|
|
205 }
|
|
206 }
|
|
207
|
|
208 sub revcompl {
|
|
209 my ($input) = @_;
|
|
210 my ($base,$seq,$f);
|
|
211
|
|
212 $seq = "";
|
|
213 for ($f=length($input)-1;$f>=0;$f--) {
|
|
214 $base = substr($input,$f,1);
|
|
215 if ($base eq "A") {
|
|
216 $seq .= "T";
|
|
217 } elsif ($base eq "T") {
|
|
218 $seq .= "A";
|
|
219 } elsif ($base eq "G") {
|
|
220 $seq .= "C";
|
|
221 } elsif ($base eq "C") {
|
|
222 $seq .= "G";
|
|
223 } elsif ($base eq "+" or $base eq "-") {
|
|
224 $seq .= $base;
|
|
225 } else {
|
|
226 $seq .= "N";
|
|
227 }
|
|
228 }
|
|
229 return $seq;
|
|
230 }
|
|
231
|
|
232 sub find_sites {
|
|
233 my ($f,$site_sum_char,$site_sum_masked,$out_sum_char,$site_seq,$out_sum_masked,@TG,@CA,$pos);
|
|
234 my ($masked_ratio_site,$masked_ratio_out,$site_depth,$out_masked,$region);
|
|
235
|
|
236 # find positions of LTR borders (TG and CA)
|
|
237 @TG = (); # positions of "T"
|
|
238 @CA = (); # positions of "A"
|
|
239 for ($f=1;$f<$assembly_length;$f++) {
|
|
240 if ($assembly_seq[$f] eq "T" and $assembly_seq[$f+1] eq "G") {
|
|
241 push(@TG,$f);
|
|
242 }
|
|
243 if ($assembly_seq[$f] eq "C" and $assembly_seq[$f+1] eq "A") {
|
|
244 push(@CA,$f+1);
|
|
245 }
|
|
246 }
|
|
247
|
|
248 foreach $pos (@TG) {
|
|
249 if ($pos-$window > 0 and $pos+$window-1 <= $assembly_length) {
|
|
250 $site_sum_char = 0; $site_sum_masked = 0; $site_seq = "";
|
|
251 for ($f=$pos;$f<$pos+$window;$f++) {
|
|
252 $site_sum_char += $assembly_char[$f];
|
|
253 $site_sum_masked += $assembly_masked[$f];
|
|
254 $site_seq .= $assembly_seq[$f];
|
|
255 }
|
|
256 $out_sum_char = 0; $out_sum_masked = 0;
|
|
257 for ($f=$pos-$window;$f<$pos;$f++) {
|
|
258 $out_sum_char += $assembly_char[$f];
|
|
259 $out_sum_masked += $assembly_masked[$f];
|
|
260 }
|
|
261 $site_depth = sprintf("%0.1f",$site_sum_char/$window); # average read (unmasked) depth over the site
|
|
262 $out_masked = sprintf("%0.1f",$out_sum_masked/$window); # average number of masked reads outside the site
|
|
263 $masked_ratio_site = sprintf("%0.4f",$site_sum_masked/($site_sum_masked+$site_sum_char));
|
|
264 $masked_ratio_out = sprintf("%0.4f",$out_sum_masked/($out_sum_masked+$out_sum_char));
|
|
265 if ($site_depth >= $min_site_depth and $out_masked >= $min_out_masked) {
|
|
266 if ($masked_ratio_out >= ($min_masked_fold_diff * $masked_ratio_site) and $max_char_to_masked >= ($site_depth/$out_masked)) {
|
|
267 print OUT "$cl\t$contig\tTG\t$pos\t$site_seq\t$site_depth\t$out_masked\t$masked_ratio_site\t$masked_ratio_out\t";
|
|
268 $region = "";
|
|
269 for ($f=$pos;$f<=$assembly_length;$f++) {
|
|
270 if ($assembly_seq[$f] ne "*") {
|
|
271 $region .= $assembly_seq[$f];
|
|
272 }
|
|
273 if (length($region) == $extract_region) {
|
|
274 $f = $assembly_length; # terminate cycle
|
|
275 }
|
|
276 }
|
|
277 print OUT "$region\t";
|
|
278 print LTR ">CL",$cl,"c".$contig."_TG_$pos\n";
|
|
279 $region = &revcompl($region);
|
|
280 print LTR "$region\n";
|
|
281 $region = "";
|
|
282 for ($f=$pos-1;$f>0;$f=$f-1) {
|
|
283 if ($assembly_seq[$f] ne "*") {
|
|
284 $region = $assembly_seq[$f].$region;
|
|
285 }
|
|
286 if (length($region) == $extract_region) {
|
|
287 $f = 0; # terminate cycle
|
|
288 }
|
|
289 }
|
|
290 print OUT "$region\n";
|
|
291 print ADJ ">CL",$cl,"c".$contig."_TG_$pos\n";
|
|
292 $region = &revcompl($region);
|
|
293 print ADJ "$region\n";
|
|
294 }
|
|
295 }
|
|
296 }
|
|
297 }
|
|
298
|
|
299 foreach $pos (@CA) {
|
|
300 if ($pos-$window+1 > 0 and $pos+$window <= $assembly_length) {
|
|
301 $site_sum_char = 0; $site_sum_masked = 0; $site_seq = "";
|
|
302 for ($f=$pos-$window+1;$f<=$pos;$f++) {
|
|
303 $site_sum_char += $assembly_char[$f];
|
|
304 $site_sum_masked += $assembly_masked[$f];
|
|
305 $site_seq .= $assembly_seq[$f];
|
|
306 }
|
|
307 $out_sum_char = 0; $out_sum_masked = 0;
|
|
308 for ($f=$pos+1;$f<=$pos+$window;$f++) {
|
|
309 $out_sum_char += $assembly_char[$f];
|
|
310 $out_sum_masked += $assembly_masked[$f];
|
|
311 }
|
|
312 $site_depth = sprintf("%0.1f",$site_sum_char/$window); # average read (unmasked) depth over the site
|
|
313 $out_masked = sprintf("%0.1f",$out_sum_masked/$window); # average number of masked reads outside the site
|
|
314 $masked_ratio_site = sprintf("%0.4f",$site_sum_masked/($site_sum_masked+$site_sum_char));
|
|
315 $masked_ratio_out = sprintf("%0.4f",$out_sum_masked/($out_sum_masked+$out_sum_char));
|
|
316 if ($site_depth >= $min_site_depth and $out_masked >= $min_out_masked) {
|
|
317 if ($masked_ratio_out >= ($min_masked_fold_diff * $masked_ratio_site) and $max_char_to_masked >= ($site_depth/$out_masked)) {
|
|
318 print OUT "$cl\t$contig\tCA\t$pos\t$site_seq\t$site_depth\t$out_masked\t$masked_ratio_site\t$masked_ratio_out\t";
|
|
319 $region = "";
|
|
320 for ($f=$pos;$f>0;$f=$f-1) {
|
|
321 if ($assembly_seq[$f] ne "*") {
|
|
322 $region = $assembly_seq[$f].$region;
|
|
323 }
|
|
324 if (length($region) == $extract_region) {
|
|
325 $f = 0; # terminate cycle
|
|
326 }
|
|
327 }
|
|
328 print OUT "$region\t";
|
|
329 print LTR ">CL",$cl,"c".$contig."_CA_$pos\n";
|
|
330 print LTR "$region\n";
|
|
331 $region = "";
|
|
332 for ($f=$pos+1;$f<=$assembly_length;$f++) {
|
|
333 if ($assembly_seq[$f] ne "*") {
|
|
334 $region .= $assembly_seq[$f];
|
|
335 }
|
|
336 if (length($region) == $extract_region) {
|
|
337 $f = $assembly_length; # terminate cycle
|
|
338 }
|
|
339 }
|
|
340 print OUT "$region\n";
|
|
341 print ADJ ">CL",$cl,"c".$contig."_CA_$pos\n";
|
|
342 print ADJ "$region\n";
|
|
343 }
|
|
344 }
|
|
345 }
|
|
346 }
|
|
347 }
|
|
348
|
|
349 sub add_PBS_info {
|
|
350 my ($pbs_blast_command,@pol,$rad,$prev_query,@table,$tab_length);
|
|
351
|
|
352 $pbs_blast_command = "blastall -p blastn -d $db_PBS -i $out_ADJ -m 8 -b 1 -e 1 -W 7 -F F";
|
|
353
|
|
354 @table = ();
|
|
355 open (TAB,$out_table) or die;
|
|
356 while ($rad = <TAB>) {
|
|
357 push(@table,$rad);
|
|
358 $tab_length++;
|
|
359 }
|
|
360 close TAB;
|
|
361
|
|
362 open (BLAST,"$pbs_blast_command |") or die;
|
|
363 $prev_query = "";
|
|
364 while ($rad = <BLAST>) {
|
|
365 if ($rad =~/^CL(\d+)c(\d+)_(TG|CA)_(\d+)\t\S+\t\S+\t/) {
|
|
366 if ("$1\t$2\t$3\t$4" ne $prev_query) { # to exclude additional HSPs from the same query/subject pair
|
|
367 for ($f=0;$f<$tab_length;$f++) {
|
|
368 @pol = split(/\t/,$table[$f]);
|
|
369 if ($pol[0] eq "$1" and $pol[1] eq "$2" and $pol[2] eq "$3" and $pol[3] eq "$4") {
|
|
370 chomp($table[$f]);
|
|
371 $table[$f] .= "\t$rad";
|
|
372 $f = $tab_length; # terminate cycle
|
|
373 }
|
|
374 }
|
|
375 $prev_query = "$1\t$2\t$3\t$4";
|
|
376 }
|
|
377 }
|
|
378 }
|
|
379 close BLAST;
|
|
380
|
|
381 open (TAB_WITH_BLAST,">$out_table.with_PBS_blast.csv") or die;
|
|
382 for ($f=0;$f<$tab_length;$f++) {
|
|
383 print TAB_WITH_BLAST $table[$f];
|
|
384 }
|
|
385 close TAB_WITH_BLAST;
|
|
386 }
|
|
387
|
|
388
|
|
389
|
|
390
|