Mercurial > repos > petrn > repeatexplorer
comparison lib/detect_LTR_insertion_sites.pl @ 0:f6ebec6e235e draft
Uploaded
author | petrn |
---|---|
date | Thu, 19 Dec 2019 13:46:43 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f6ebec6e235e |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 # parses ACE files of assembled repeats and detects potential | |
4 # LTR borders/insertion sites of LTR-retroelements | |
5 | |
6 # "site" is a region (size of $window) including TG or CA | |
7 # "out" is a region adjacent to the site, presumably representing insertion sites | |
8 | |
9 # this is RepeatExplorer version of "detect_insertion_sites_LTRs.pl" | |
10 # -m default set to 10 | |
11 | |
12 use Getopt::Std; | |
13 | |
14 | |
15 | |
16 | |
17 getopt('iowsmdrp'); | |
18 if ($opt_i) { | |
19 $infile = $opt_i; | |
20 } else { | |
21 die "-i input_file_name missing\n"; | |
22 } | |
23 | |
24 if ($opt_p) { | |
25 $db_PBS = $opt_p; | |
26 } else { | |
27 die "-p PBS database is missing\n"; | |
28 } | |
29 | |
30 | |
31 | |
32 | |
33 if ($opt_o) { | |
34 $outfile = $opt_o; | |
35 } else { | |
36 die "-o output_file_name missing\n"; | |
37 } | |
38 if ($opt_w) { | |
39 $window = $opt_w; | |
40 } else { | |
41 $window = 7; | |
42 print "window size not set, using default ($window)\n"; | |
43 } | |
44 if ($opt_s) { | |
45 $min_site_depth = $opt_s; # minimal average read depth (over $window) required for the site | |
46 } else { | |
47 $min_site_depth = 10; | |
48 print "min_site_depth not set, using default ($min_site_depth)\n"; | |
49 } | |
50 if ($opt_m) { | |
51 $min_out_masked = $opt_m; # minimal average number of masked reads outside the site (over $window) | |
52 } else { | |
53 $min_out_masked = 10; | |
54 print "min_out_masked not set, using default ($min_out_masked)\n"; | |
55 } | |
56 if ($opt_d) { | |
57 $min_masked_fold_diff = $opt_d; # how many times should the proportion of masked reads "out" be higher than in "site" | |
58 } else { | |
59 $min_masked_fold_diff = 3; | |
60 print "min_masked_fold_diff not set, using default ($min_masked_fold_diff)\n"; | |
61 } | |
62 if ($opt_x) { | |
63 $max_char_to_masked = $opt_x; # max fold difference between depth in "site" and masked depth "out" | |
64 } else { | |
65 $max_char_to_masked = 10; | |
66 print "max_char_to_masked not set, using default ($max_char_to_masked)\n"; | |
67 } | |
68 if ($opt_r) { | |
69 $extract_region = $opt_r; | |
70 } else { | |
71 $extract_region = 30; | |
72 print "extract_region not set, using default ($extract_region)\n"; | |
73 } | |
74 | |
75 # main | |
76 $out_table = $outfile; | |
77 $out_LTR = "$outfile.LTR"; | |
78 $out_ADJ = "$outfile.ADJ"; | |
79 open (IN,$infile) or die; | |
80 open (OUT,">$out_table") or die; | |
81 open (LTR,">$out_LTR") or die; # LTR end regions as fasta seq; all are converetd to ....CA (so TG... regions are reverse-complemented) | |
82 open (ADJ,">$out_ADJ") or die; # regions adjacent to LTR ends; if LTR end is rev-complemented, so is its corresponding adjacent region | |
83 print OUT "#Parameters:\n"; | |
84 print OUT "#infile\t$infile\n#outfile\t$outfile\n#window\t$window\n#min_site_depth\t$min_site_depth\n"; | |
85 print OUT "#min_out_masked\t$min_out_masked\n#min_masked_fold_diff\t$min_masked_fold_diff\n#max_char_to_masked\t$max_char_to_masked\n#extract_region\t$extract_region\n\n"; | |
86 print OUT "CL\tcontig\tTG/CA\tposition\tsite\tsite_depth\tout_masked\tmasked_ratio_site\tmasked_ratio_out\tregion_in\tregion_out\tblast PBS\n"; | |
87 print "Analyzing ACE file...\n"; | |
88 $prev = 0; | |
89 while ($radek = <IN>) { | |
90 $contig_found = &read_contig; | |
91 if ($contig_found) { | |
92 if ($cl > $prev) { | |
93 $prev = $cl; | |
94 } | |
95 &reconstruct_assembly; | |
96 &find_sites; | |
97 } | |
98 } | |
99 close IN; | |
100 close OUT; | |
101 close LTR; | |
102 close ADJ; | |
103 print "Running blast against tRNA database...\n"; | |
104 &add_PBS_info; # detects similarities of sequences in ADJ to tRNA database (!!! reads ADJ and $out_table !!!) | |
105 | |
106 $error = system("rm $out_table"); | |
107 if ($error) { | |
108 print "Error removing $out_table\n"; | |
109 } | |
110 | |
111 sub read_contig { | |
112 my ($reads_found,$read_id); | |
113 # global variables | |
114 $cl = 0; | |
115 $contig = 0; | |
116 $cont_length = 0; | |
117 $reads = 0; # number of reads | |
118 $cons = ""; # contig consensus (including gaps *) | |
119 %read_starts = (); # starts of reads within assembly | |
120 %read_lengths = (); # length of reads in assembly (may contain gaps) | |
121 %read_from = (); # start of non-masked part of read sequence (relative to the read) | |
122 %read_to = (); # end of non-masked part of read sequence | |
123 | |
124 do { | |
125 if ($radek =~/^CO CL(\d+)Contig(\d+) (\d+) (\d+)/) { | |
126 $cl = $1; $contig = $2; $cont_length = $3; $reads = $4; | |
127 while ($radek = <IN> and length($radek) > 1) { | |
128 chomp($radek); | |
129 $cons .= $radek; | |
130 } | |
131 do { | |
132 if ($radek =~/^AF (\S+) [UC] ([-]?\d+)/) { | |
133 #print "$1 : $2\n"; | |
134 $read_starts{$1} = $2; | |
135 } | |
136 } while ($radek = <IN> and not $radek =~/^BS \d+ \d+/); | |
137 $reads_found = 0; | |
138 while ($reads_found < $reads) { # expects previously specified number of reads | |
139 $radek = <IN>; # expects RD lines with read ids alternate with QA lines | |
140 if ($radek =~/^RD (\S+) (\d+)/) { | |
141 $read_id = $1; | |
142 $read_lengths{$read_id} = $2; | |
143 } | |
144 if ($radek =~/^QA (\d+) (\d+)/) { | |
145 $read_from{$read_id} = $1; | |
146 $read_to{$read_id} = $2; | |
147 $reads_found++; | |
148 } | |
149 } | |
150 return 1; | |
151 } | |
152 } while ($radek = <IN>); | |
153 return 0; | |
154 } | |
155 | |
156 sub reconstruct_assembly { | |
157 my ($id,$min_start,$max_end,$shift,$f,$poz); | |
158 # global variables | |
159 @assembly_seq = (); # sequence at each position of assembly; it corresponds to consensus, or regions | |
160 # before (-) and after (+) consensus [assembly may be longer than consensus] | |
161 @assembly_char = (); # number of assembly positions with non-masked characters (nucleotides or gaps) | |
162 @assembly_masked = (); # number of masked positions | |
163 $assembly_length = 0; # total length, including - and + regions | |
164 | |
165 $min_start = 1; $max_end = 1; | |
166 foreach $id (keys(%read_starts)) { | |
167 if ($min_start > $read_starts{$id}) { | |
168 $min_start = $read_starts{$id}; | |
169 } | |
170 if ($max_end < $read_starts{$id}+$read_lengths{$id}-1) { | |
171 $max_end = $read_starts{$id}+$read_lengths{$id}-1; | |
172 } | |
173 } | |
174 if ($min_start < 1) { | |
175 $shift = abs($min_start) +1; | |
176 $assembly_length = $shift + $max_end; | |
177 } else { | |
178 $assembly_length = $max_end; | |
179 $shift = 0; | |
180 } | |
181 | |
182 for ($f=1;$f<=$shift;$f++) { | |
183 $assembly_seq[$f] = "-"; | |
184 } | |
185 for ($f=1;$f<=$cont_length;$f++) { | |
186 $assembly_seq[$f+$shift] = substr($cons,$f-1,1); | |
187 } | |
188 for ($f=$shift+$cont_length+1;$f<=$assembly_length;$f++) { | |
189 $assembly_seq[$f] = "+"; | |
190 } | |
191 | |
192 for ($f=1;$f<=$assembly_length;$f++) { | |
193 $assembly_char[$f] = 0; | |
194 $assembly_masked[$f] = 0; | |
195 } | |
196 foreach $id (keys(%read_starts)) { | |
197 for ($f=1;$f<=$read_lengths{$id};$f++) { | |
198 $poz = $read_starts{$id} + $shift + $f - 1; | |
199 if ($f>=$read_from{$id} and $f<=$read_to{$id}) { | |
200 $assembly_char[$poz]++; | |
201 } else { | |
202 $assembly_masked[$poz]++; | |
203 } | |
204 } | |
205 } | |
206 } | |
207 | |
208 sub revcompl { | |
209 my ($input) = @_; | |
210 my ($base,$seq,$f); | |
211 | |
212 $seq = ""; | |
213 for ($f=length($input)-1;$f>=0;$f--) { | |
214 $base = substr($input,$f,1); | |
215 if ($base eq "A") { | |
216 $seq .= "T"; | |
217 } elsif ($base eq "T") { | |
218 $seq .= "A"; | |
219 } elsif ($base eq "G") { | |
220 $seq .= "C"; | |
221 } elsif ($base eq "C") { | |
222 $seq .= "G"; | |
223 } elsif ($base eq "+" or $base eq "-") { | |
224 $seq .= $base; | |
225 } else { | |
226 $seq .= "N"; | |
227 } | |
228 } | |
229 return $seq; | |
230 } | |
231 | |
232 sub find_sites { | |
233 my ($f,$site_sum_char,$site_sum_masked,$out_sum_char,$site_seq,$out_sum_masked,@TG,@CA,$pos); | |
234 my ($masked_ratio_site,$masked_ratio_out,$site_depth,$out_masked,$region); | |
235 | |
236 # find positions of LTR borders (TG and CA) | |
237 @TG = (); # positions of "T" | |
238 @CA = (); # positions of "A" | |
239 for ($f=1;$f<$assembly_length;$f++) { | |
240 if ($assembly_seq[$f] eq "T" and $assembly_seq[$f+1] eq "G") { | |
241 push(@TG,$f); | |
242 } | |
243 if ($assembly_seq[$f] eq "C" and $assembly_seq[$f+1] eq "A") { | |
244 push(@CA,$f+1); | |
245 } | |
246 } | |
247 | |
248 foreach $pos (@TG) { | |
249 if ($pos-$window > 0 and $pos+$window-1 <= $assembly_length) { | |
250 $site_sum_char = 0; $site_sum_masked = 0; $site_seq = ""; | |
251 for ($f=$pos;$f<$pos+$window;$f++) { | |
252 $site_sum_char += $assembly_char[$f]; | |
253 $site_sum_masked += $assembly_masked[$f]; | |
254 $site_seq .= $assembly_seq[$f]; | |
255 } | |
256 $out_sum_char = 0; $out_sum_masked = 0; | |
257 for ($f=$pos-$window;$f<$pos;$f++) { | |
258 $out_sum_char += $assembly_char[$f]; | |
259 $out_sum_masked += $assembly_masked[$f]; | |
260 } | |
261 $site_depth = sprintf("%0.1f",$site_sum_char/$window); # average read (unmasked) depth over the site | |
262 $out_masked = sprintf("%0.1f",$out_sum_masked/$window); # average number of masked reads outside the site | |
263 $masked_ratio_site = sprintf("%0.4f",$site_sum_masked/($site_sum_masked+$site_sum_char)); | |
264 $masked_ratio_out = sprintf("%0.4f",$out_sum_masked/($out_sum_masked+$out_sum_char)); | |
265 if ($site_depth >= $min_site_depth and $out_masked >= $min_out_masked) { | |
266 if ($masked_ratio_out >= ($min_masked_fold_diff * $masked_ratio_site) and $max_char_to_masked >= ($site_depth/$out_masked)) { | |
267 print OUT "$cl\t$contig\tTG\t$pos\t$site_seq\t$site_depth\t$out_masked\t$masked_ratio_site\t$masked_ratio_out\t"; | |
268 $region = ""; | |
269 for ($f=$pos;$f<=$assembly_length;$f++) { | |
270 if ($assembly_seq[$f] ne "*") { | |
271 $region .= $assembly_seq[$f]; | |
272 } | |
273 if (length($region) == $extract_region) { | |
274 $f = $assembly_length; # terminate cycle | |
275 } | |
276 } | |
277 print OUT "$region\t"; | |
278 print LTR ">CL",$cl,"c".$contig."_TG_$pos\n"; | |
279 $region = &revcompl($region); | |
280 print LTR "$region\n"; | |
281 $region = ""; | |
282 for ($f=$pos-1;$f>0;$f=$f-1) { | |
283 if ($assembly_seq[$f] ne "*") { | |
284 $region = $assembly_seq[$f].$region; | |
285 } | |
286 if (length($region) == $extract_region) { | |
287 $f = 0; # terminate cycle | |
288 } | |
289 } | |
290 print OUT "$region\n"; | |
291 print ADJ ">CL",$cl,"c".$contig."_TG_$pos\n"; | |
292 $region = &revcompl($region); | |
293 print ADJ "$region\n"; | |
294 } | |
295 } | |
296 } | |
297 } | |
298 | |
299 foreach $pos (@CA) { | |
300 if ($pos-$window+1 > 0 and $pos+$window <= $assembly_length) { | |
301 $site_sum_char = 0; $site_sum_masked = 0; $site_seq = ""; | |
302 for ($f=$pos-$window+1;$f<=$pos;$f++) { | |
303 $site_sum_char += $assembly_char[$f]; | |
304 $site_sum_masked += $assembly_masked[$f]; | |
305 $site_seq .= $assembly_seq[$f]; | |
306 } | |
307 $out_sum_char = 0; $out_sum_masked = 0; | |
308 for ($f=$pos+1;$f<=$pos+$window;$f++) { | |
309 $out_sum_char += $assembly_char[$f]; | |
310 $out_sum_masked += $assembly_masked[$f]; | |
311 } | |
312 $site_depth = sprintf("%0.1f",$site_sum_char/$window); # average read (unmasked) depth over the site | |
313 $out_masked = sprintf("%0.1f",$out_sum_masked/$window); # average number of masked reads outside the site | |
314 $masked_ratio_site = sprintf("%0.4f",$site_sum_masked/($site_sum_masked+$site_sum_char)); | |
315 $masked_ratio_out = sprintf("%0.4f",$out_sum_masked/($out_sum_masked+$out_sum_char)); | |
316 if ($site_depth >= $min_site_depth and $out_masked >= $min_out_masked) { | |
317 if ($masked_ratio_out >= ($min_masked_fold_diff * $masked_ratio_site) and $max_char_to_masked >= ($site_depth/$out_masked)) { | |
318 print OUT "$cl\t$contig\tCA\t$pos\t$site_seq\t$site_depth\t$out_masked\t$masked_ratio_site\t$masked_ratio_out\t"; | |
319 $region = ""; | |
320 for ($f=$pos;$f>0;$f=$f-1) { | |
321 if ($assembly_seq[$f] ne "*") { | |
322 $region = $assembly_seq[$f].$region; | |
323 } | |
324 if (length($region) == $extract_region) { | |
325 $f = 0; # terminate cycle | |
326 } | |
327 } | |
328 print OUT "$region\t"; | |
329 print LTR ">CL",$cl,"c".$contig."_CA_$pos\n"; | |
330 print LTR "$region\n"; | |
331 $region = ""; | |
332 for ($f=$pos+1;$f<=$assembly_length;$f++) { | |
333 if ($assembly_seq[$f] ne "*") { | |
334 $region .= $assembly_seq[$f]; | |
335 } | |
336 if (length($region) == $extract_region) { | |
337 $f = $assembly_length; # terminate cycle | |
338 } | |
339 } | |
340 print OUT "$region\n"; | |
341 print ADJ ">CL",$cl,"c".$contig."_CA_$pos\n"; | |
342 print ADJ "$region\n"; | |
343 } | |
344 } | |
345 } | |
346 } | |
347 } | |
348 | |
349 sub add_PBS_info { | |
350 my ($pbs_blast_command,@pol,$rad,$prev_query,@table,$tab_length); | |
351 | |
352 $pbs_blast_command = "blastall -p blastn -d $db_PBS -i $out_ADJ -m 8 -b 1 -e 1 -W 7 -F F"; | |
353 | |
354 @table = (); | |
355 open (TAB,$out_table) or die; | |
356 while ($rad = <TAB>) { | |
357 push(@table,$rad); | |
358 $tab_length++; | |
359 } | |
360 close TAB; | |
361 | |
362 open (BLAST,"$pbs_blast_command |") or die; | |
363 $prev_query = ""; | |
364 while ($rad = <BLAST>) { | |
365 if ($rad =~/^CL(\d+)c(\d+)_(TG|CA)_(\d+)\t\S+\t\S+\t/) { | |
366 if ("$1\t$2\t$3\t$4" ne $prev_query) { # to exclude additional HSPs from the same query/subject pair | |
367 for ($f=0;$f<$tab_length;$f++) { | |
368 @pol = split(/\t/,$table[$f]); | |
369 if ($pol[0] eq "$1" and $pol[1] eq "$2" and $pol[2] eq "$3" and $pol[3] eq "$4") { | |
370 chomp($table[$f]); | |
371 $table[$f] .= "\t$rad"; | |
372 $f = $tab_length; # terminate cycle | |
373 } | |
374 } | |
375 $prev_query = "$1\t$2\t$3\t$4"; | |
376 } | |
377 } | |
378 } | |
379 close BLAST; | |
380 | |
381 open (TAB_WITH_BLAST,">$out_table.with_PBS_blast.csv") or die; | |
382 for ($f=0;$f<$tab_length;$f++) { | |
383 print TAB_WITH_BLAST $table[$f]; | |
384 } | |
385 close TAB_WITH_BLAST; | |
386 } | |
387 | |
388 | |
389 | |
390 |