comparison lib/detect_LTR_insertion_sites.pl @ 0:f6ebec6e235e draft

Uploaded
author petrn
date Thu, 19 Dec 2019 13:46:43 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:f6ebec6e235e
1 #!/usr/bin/env perl
2
3 # parses ACE files of assembled repeats and detects potential
4 # LTR borders/insertion sites of LTR-retroelements
5
6 # "site" is a region (size of $window) including TG or CA
7 # "out" is a region adjacent to the site, presumably representing insertion sites
8
9 # this is RepeatExplorer version of "detect_insertion_sites_LTRs.pl"
10 # -m default set to 10
11
12 use Getopt::Std;
13
14
15
16
17 getopt('iowsmdrp');
18 if ($opt_i) {
19 $infile = $opt_i;
20 } else {
21 die "-i input_file_name missing\n";
22 }
23
24 if ($opt_p) {
25 $db_PBS = $opt_p;
26 } else {
27 die "-p PBS database is missing\n";
28 }
29
30
31
32
33 if ($opt_o) {
34 $outfile = $opt_o;
35 } else {
36 die "-o output_file_name missing\n";
37 }
38 if ($opt_w) {
39 $window = $opt_w;
40 } else {
41 $window = 7;
42 print "window size not set, using default ($window)\n";
43 }
44 if ($opt_s) {
45 $min_site_depth = $opt_s; # minimal average read depth (over $window) required for the site
46 } else {
47 $min_site_depth = 10;
48 print "min_site_depth not set, using default ($min_site_depth)\n";
49 }
50 if ($opt_m) {
51 $min_out_masked = $opt_m; # minimal average number of masked reads outside the site (over $window)
52 } else {
53 $min_out_masked = 10;
54 print "min_out_masked not set, using default ($min_out_masked)\n";
55 }
56 if ($opt_d) {
57 $min_masked_fold_diff = $opt_d; # how many times should the proportion of masked reads "out" be higher than in "site"
58 } else {
59 $min_masked_fold_diff = 3;
60 print "min_masked_fold_diff not set, using default ($min_masked_fold_diff)\n";
61 }
62 if ($opt_x) {
63 $max_char_to_masked = $opt_x; # max fold difference between depth in "site" and masked depth "out"
64 } else {
65 $max_char_to_masked = 10;
66 print "max_char_to_masked not set, using default ($max_char_to_masked)\n";
67 }
68 if ($opt_r) {
69 $extract_region = $opt_r;
70 } else {
71 $extract_region = 30;
72 print "extract_region not set, using default ($extract_region)\n";
73 }
74
75 # main
76 $out_table = $outfile;
77 $out_LTR = "$outfile.LTR";
78 $out_ADJ = "$outfile.ADJ";
79 open (IN,$infile) or die;
80 open (OUT,">$out_table") or die;
81 open (LTR,">$out_LTR") or die; # LTR end regions as fasta seq; all are converetd to ....CA (so TG... regions are reverse-complemented)
82 open (ADJ,">$out_ADJ") or die; # regions adjacent to LTR ends; if LTR end is rev-complemented, so is its corresponding adjacent region
83 print OUT "#Parameters:\n";
84 print OUT "#infile\t$infile\n#outfile\t$outfile\n#window\t$window\n#min_site_depth\t$min_site_depth\n";
85 print OUT "#min_out_masked\t$min_out_masked\n#min_masked_fold_diff\t$min_masked_fold_diff\n#max_char_to_masked\t$max_char_to_masked\n#extract_region\t$extract_region\n\n";
86 print OUT "CL\tcontig\tTG/CA\tposition\tsite\tsite_depth\tout_masked\tmasked_ratio_site\tmasked_ratio_out\tregion_in\tregion_out\tblast PBS\n";
87 print "Analyzing ACE file...\n";
88 $prev = 0;
89 while ($radek = <IN>) {
90 $contig_found = &read_contig;
91 if ($contig_found) {
92 if ($cl > $prev) {
93 $prev = $cl;
94 }
95 &reconstruct_assembly;
96 &find_sites;
97 }
98 }
99 close IN;
100 close OUT;
101 close LTR;
102 close ADJ;
103 print "Running blast against tRNA database...\n";
104 &add_PBS_info; # detects similarities of sequences in ADJ to tRNA database (!!! reads ADJ and $out_table !!!)
105
106 $error = system("rm $out_table");
107 if ($error) {
108 print "Error removing $out_table\n";
109 }
110
111 sub read_contig {
112 my ($reads_found,$read_id);
113 # global variables
114 $cl = 0;
115 $contig = 0;
116 $cont_length = 0;
117 $reads = 0; # number of reads
118 $cons = ""; # contig consensus (including gaps *)
119 %read_starts = (); # starts of reads within assembly
120 %read_lengths = (); # length of reads in assembly (may contain gaps)
121 %read_from = (); # start of non-masked part of read sequence (relative to the read)
122 %read_to = (); # end of non-masked part of read sequence
123
124 do {
125 if ($radek =~/^CO CL(\d+)Contig(\d+) (\d+) (\d+)/) {
126 $cl = $1; $contig = $2; $cont_length = $3; $reads = $4;
127 while ($radek = <IN> and length($radek) > 1) {
128 chomp($radek);
129 $cons .= $radek;
130 }
131 do {
132 if ($radek =~/^AF (\S+) [UC] ([-]?\d+)/) {
133 #print "$1 : $2\n";
134 $read_starts{$1} = $2;
135 }
136 } while ($radek = <IN> and not $radek =~/^BS \d+ \d+/);
137 $reads_found = 0;
138 while ($reads_found < $reads) { # expects previously specified number of reads
139 $radek = <IN>; # expects RD lines with read ids alternate with QA lines
140 if ($radek =~/^RD (\S+) (\d+)/) {
141 $read_id = $1;
142 $read_lengths{$read_id} = $2;
143 }
144 if ($radek =~/^QA (\d+) (\d+)/) {
145 $read_from{$read_id} = $1;
146 $read_to{$read_id} = $2;
147 $reads_found++;
148 }
149 }
150 return 1;
151 }
152 } while ($radek = <IN>);
153 return 0;
154 }
155
156 sub reconstruct_assembly {
157 my ($id,$min_start,$max_end,$shift,$f,$poz);
158 # global variables
159 @assembly_seq = (); # sequence at each position of assembly; it corresponds to consensus, or regions
160 # before (-) and after (+) consensus [assembly may be longer than consensus]
161 @assembly_char = (); # number of assembly positions with non-masked characters (nucleotides or gaps)
162 @assembly_masked = (); # number of masked positions
163 $assembly_length = 0; # total length, including - and + regions
164
165 $min_start = 1; $max_end = 1;
166 foreach $id (keys(%read_starts)) {
167 if ($min_start > $read_starts{$id}) {
168 $min_start = $read_starts{$id};
169 }
170 if ($max_end < $read_starts{$id}+$read_lengths{$id}-1) {
171 $max_end = $read_starts{$id}+$read_lengths{$id}-1;
172 }
173 }
174 if ($min_start < 1) {
175 $shift = abs($min_start) +1;
176 $assembly_length = $shift + $max_end;
177 } else {
178 $assembly_length = $max_end;
179 $shift = 0;
180 }
181
182 for ($f=1;$f<=$shift;$f++) {
183 $assembly_seq[$f] = "-";
184 }
185 for ($f=1;$f<=$cont_length;$f++) {
186 $assembly_seq[$f+$shift] = substr($cons,$f-1,1);
187 }
188 for ($f=$shift+$cont_length+1;$f<=$assembly_length;$f++) {
189 $assembly_seq[$f] = "+";
190 }
191
192 for ($f=1;$f<=$assembly_length;$f++) {
193 $assembly_char[$f] = 0;
194 $assembly_masked[$f] = 0;
195 }
196 foreach $id (keys(%read_starts)) {
197 for ($f=1;$f<=$read_lengths{$id};$f++) {
198 $poz = $read_starts{$id} + $shift + $f - 1;
199 if ($f>=$read_from{$id} and $f<=$read_to{$id}) {
200 $assembly_char[$poz]++;
201 } else {
202 $assembly_masked[$poz]++;
203 }
204 }
205 }
206 }
207
208 sub revcompl {
209 my ($input) = @_;
210 my ($base,$seq,$f);
211
212 $seq = "";
213 for ($f=length($input)-1;$f>=0;$f--) {
214 $base = substr($input,$f,1);
215 if ($base eq "A") {
216 $seq .= "T";
217 } elsif ($base eq "T") {
218 $seq .= "A";
219 } elsif ($base eq "G") {
220 $seq .= "C";
221 } elsif ($base eq "C") {
222 $seq .= "G";
223 } elsif ($base eq "+" or $base eq "-") {
224 $seq .= $base;
225 } else {
226 $seq .= "N";
227 }
228 }
229 return $seq;
230 }
231
232 sub find_sites {
233 my ($f,$site_sum_char,$site_sum_masked,$out_sum_char,$site_seq,$out_sum_masked,@TG,@CA,$pos);
234 my ($masked_ratio_site,$masked_ratio_out,$site_depth,$out_masked,$region);
235
236 # find positions of LTR borders (TG and CA)
237 @TG = (); # positions of "T"
238 @CA = (); # positions of "A"
239 for ($f=1;$f<$assembly_length;$f++) {
240 if ($assembly_seq[$f] eq "T" and $assembly_seq[$f+1] eq "G") {
241 push(@TG,$f);
242 }
243 if ($assembly_seq[$f] eq "C" and $assembly_seq[$f+1] eq "A") {
244 push(@CA,$f+1);
245 }
246 }
247
248 foreach $pos (@TG) {
249 if ($pos-$window > 0 and $pos+$window-1 <= $assembly_length) {
250 $site_sum_char = 0; $site_sum_masked = 0; $site_seq = "";
251 for ($f=$pos;$f<$pos+$window;$f++) {
252 $site_sum_char += $assembly_char[$f];
253 $site_sum_masked += $assembly_masked[$f];
254 $site_seq .= $assembly_seq[$f];
255 }
256 $out_sum_char = 0; $out_sum_masked = 0;
257 for ($f=$pos-$window;$f<$pos;$f++) {
258 $out_sum_char += $assembly_char[$f];
259 $out_sum_masked += $assembly_masked[$f];
260 }
261 $site_depth = sprintf("%0.1f",$site_sum_char/$window); # average read (unmasked) depth over the site
262 $out_masked = sprintf("%0.1f",$out_sum_masked/$window); # average number of masked reads outside the site
263 $masked_ratio_site = sprintf("%0.4f",$site_sum_masked/($site_sum_masked+$site_sum_char));
264 $masked_ratio_out = sprintf("%0.4f",$out_sum_masked/($out_sum_masked+$out_sum_char));
265 if ($site_depth >= $min_site_depth and $out_masked >= $min_out_masked) {
266 if ($masked_ratio_out >= ($min_masked_fold_diff * $masked_ratio_site) and $max_char_to_masked >= ($site_depth/$out_masked)) {
267 print OUT "$cl\t$contig\tTG\t$pos\t$site_seq\t$site_depth\t$out_masked\t$masked_ratio_site\t$masked_ratio_out\t";
268 $region = "";
269 for ($f=$pos;$f<=$assembly_length;$f++) {
270 if ($assembly_seq[$f] ne "*") {
271 $region .= $assembly_seq[$f];
272 }
273 if (length($region) == $extract_region) {
274 $f = $assembly_length; # terminate cycle
275 }
276 }
277 print OUT "$region\t";
278 print LTR ">CL",$cl,"c".$contig."_TG_$pos\n";
279 $region = &revcompl($region);
280 print LTR "$region\n";
281 $region = "";
282 for ($f=$pos-1;$f>0;$f=$f-1) {
283 if ($assembly_seq[$f] ne "*") {
284 $region = $assembly_seq[$f].$region;
285 }
286 if (length($region) == $extract_region) {
287 $f = 0; # terminate cycle
288 }
289 }
290 print OUT "$region\n";
291 print ADJ ">CL",$cl,"c".$contig."_TG_$pos\n";
292 $region = &revcompl($region);
293 print ADJ "$region\n";
294 }
295 }
296 }
297 }
298
299 foreach $pos (@CA) {
300 if ($pos-$window+1 > 0 and $pos+$window <= $assembly_length) {
301 $site_sum_char = 0; $site_sum_masked = 0; $site_seq = "";
302 for ($f=$pos-$window+1;$f<=$pos;$f++) {
303 $site_sum_char += $assembly_char[$f];
304 $site_sum_masked += $assembly_masked[$f];
305 $site_seq .= $assembly_seq[$f];
306 }
307 $out_sum_char = 0; $out_sum_masked = 0;
308 for ($f=$pos+1;$f<=$pos+$window;$f++) {
309 $out_sum_char += $assembly_char[$f];
310 $out_sum_masked += $assembly_masked[$f];
311 }
312 $site_depth = sprintf("%0.1f",$site_sum_char/$window); # average read (unmasked) depth over the site
313 $out_masked = sprintf("%0.1f",$out_sum_masked/$window); # average number of masked reads outside the site
314 $masked_ratio_site = sprintf("%0.4f",$site_sum_masked/($site_sum_masked+$site_sum_char));
315 $masked_ratio_out = sprintf("%0.4f",$out_sum_masked/($out_sum_masked+$out_sum_char));
316 if ($site_depth >= $min_site_depth and $out_masked >= $min_out_masked) {
317 if ($masked_ratio_out >= ($min_masked_fold_diff * $masked_ratio_site) and $max_char_to_masked >= ($site_depth/$out_masked)) {
318 print OUT "$cl\t$contig\tCA\t$pos\t$site_seq\t$site_depth\t$out_masked\t$masked_ratio_site\t$masked_ratio_out\t";
319 $region = "";
320 for ($f=$pos;$f>0;$f=$f-1) {
321 if ($assembly_seq[$f] ne "*") {
322 $region = $assembly_seq[$f].$region;
323 }
324 if (length($region) == $extract_region) {
325 $f = 0; # terminate cycle
326 }
327 }
328 print OUT "$region\t";
329 print LTR ">CL",$cl,"c".$contig."_CA_$pos\n";
330 print LTR "$region\n";
331 $region = "";
332 for ($f=$pos+1;$f<=$assembly_length;$f++) {
333 if ($assembly_seq[$f] ne "*") {
334 $region .= $assembly_seq[$f];
335 }
336 if (length($region) == $extract_region) {
337 $f = $assembly_length; # terminate cycle
338 }
339 }
340 print OUT "$region\n";
341 print ADJ ">CL",$cl,"c".$contig."_CA_$pos\n";
342 print ADJ "$region\n";
343 }
344 }
345 }
346 }
347 }
348
349 sub add_PBS_info {
350 my ($pbs_blast_command,@pol,$rad,$prev_query,@table,$tab_length);
351
352 $pbs_blast_command = "blastall -p blastn -d $db_PBS -i $out_ADJ -m 8 -b 1 -e 1 -W 7 -F F";
353
354 @table = ();
355 open (TAB,$out_table) or die;
356 while ($rad = <TAB>) {
357 push(@table,$rad);
358 $tab_length++;
359 }
360 close TAB;
361
362 open (BLAST,"$pbs_blast_command |") or die;
363 $prev_query = "";
364 while ($rad = <BLAST>) {
365 if ($rad =~/^CL(\d+)c(\d+)_(TG|CA)_(\d+)\t\S+\t\S+\t/) {
366 if ("$1\t$2\t$3\t$4" ne $prev_query) { # to exclude additional HSPs from the same query/subject pair
367 for ($f=0;$f<$tab_length;$f++) {
368 @pol = split(/\t/,$table[$f]);
369 if ($pol[0] eq "$1" and $pol[1] eq "$2" and $pol[2] eq "$3" and $pol[3] eq "$4") {
370 chomp($table[$f]);
371 $table[$f] .= "\t$rad";
372 $f = $tab_length; # terminate cycle
373 }
374 }
375 $prev_query = "$1\t$2\t$3\t$4";
376 }
377 }
378 }
379 close BLAST;
380
381 open (TAB_WITH_BLAST,">$out_table.with_PBS_blast.csv") or die;
382 for ($f=0;$f<$tab_length;$f++) {
383 print TAB_WITH_BLAST $table[$f];
384 }
385 close TAB_WITH_BLAST;
386 }
387
388
389
390