annotate lib/detect_LTR_insertion_sites.pl @ 0:f6ebec6e235e draft

Uploaded
author petrn
date Thu, 19 Dec 2019 13:46:43 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1 #!/usr/bin/env perl
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
2
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
3 # parses ACE files of assembled repeats and detects potential
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
4 # LTR borders/insertion sites of LTR-retroelements
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
5
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
6 # "site" is a region (size of $window) including TG or CA
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
7 # "out" is a region adjacent to the site, presumably representing insertion sites
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
8
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
9 # this is RepeatExplorer version of "detect_insertion_sites_LTRs.pl"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
10 # -m default set to 10
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
11
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
12 use Getopt::Std;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
13
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
14
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
15
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
16
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
17 getopt('iowsmdrp');
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
18 if ($opt_i) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
19 $infile = $opt_i;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
20 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
21 die "-i input_file_name missing\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
22 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
23
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
24 if ($opt_p) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
25 $db_PBS = $opt_p;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
26 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
27 die "-p PBS database is missing\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
28 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
29
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
30
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
31
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
32
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
33 if ($opt_o) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
34 $outfile = $opt_o;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
35 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
36 die "-o output_file_name missing\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
37 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
38 if ($opt_w) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
39 $window = $opt_w;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
40 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
41 $window = 7;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
42 print "window size not set, using default ($window)\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
43 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
44 if ($opt_s) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
45 $min_site_depth = $opt_s; # minimal average read depth (over $window) required for the site
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
46 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
47 $min_site_depth = 10;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
48 print "min_site_depth not set, using default ($min_site_depth)\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
49 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
50 if ($opt_m) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
51 $min_out_masked = $opt_m; # minimal average number of masked reads outside the site (over $window)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
52 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
53 $min_out_masked = 10;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
54 print "min_out_masked not set, using default ($min_out_masked)\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
55 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
56 if ($opt_d) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
57 $min_masked_fold_diff = $opt_d; # how many times should the proportion of masked reads "out" be higher than in "site"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
58 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
59 $min_masked_fold_diff = 3;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
60 print "min_masked_fold_diff not set, using default ($min_masked_fold_diff)\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
61 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
62 if ($opt_x) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
63 $max_char_to_masked = $opt_x; # max fold difference between depth in "site" and masked depth "out"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
64 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
65 $max_char_to_masked = 10;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
66 print "max_char_to_masked not set, using default ($max_char_to_masked)\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
67 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
68 if ($opt_r) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
69 $extract_region = $opt_r;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
70 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
71 $extract_region = 30;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
72 print "extract_region not set, using default ($extract_region)\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
73 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
74
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
75 # main
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
76 $out_table = $outfile;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
77 $out_LTR = "$outfile.LTR";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
78 $out_ADJ = "$outfile.ADJ";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
79 open (IN,$infile) or die;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
80 open (OUT,">$out_table") or die;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
81 open (LTR,">$out_LTR") or die; # LTR end regions as fasta seq; all are converetd to ....CA (so TG... regions are reverse-complemented)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
82 open (ADJ,">$out_ADJ") or die; # regions adjacent to LTR ends; if LTR end is rev-complemented, so is its corresponding adjacent region
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
83 print OUT "#Parameters:\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
84 print OUT "#infile\t$infile\n#outfile\t$outfile\n#window\t$window\n#min_site_depth\t$min_site_depth\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
85 print OUT "#min_out_masked\t$min_out_masked\n#min_masked_fold_diff\t$min_masked_fold_diff\n#max_char_to_masked\t$max_char_to_masked\n#extract_region\t$extract_region\n\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
86 print OUT "CL\tcontig\tTG/CA\tposition\tsite\tsite_depth\tout_masked\tmasked_ratio_site\tmasked_ratio_out\tregion_in\tregion_out\tblast PBS\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
87 print "Analyzing ACE file...\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
88 $prev = 0;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
89 while ($radek = <IN>) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
90 $contig_found = &read_contig;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
91 if ($contig_found) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
92 if ($cl > $prev) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
93 $prev = $cl;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
94 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
95 &reconstruct_assembly;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
96 &find_sites;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
97 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
98 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
99 close IN;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
100 close OUT;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
101 close LTR;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
102 close ADJ;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
103 print "Running blast against tRNA database...\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
104 &add_PBS_info; # detects similarities of sequences in ADJ to tRNA database (!!! reads ADJ and $out_table !!!)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
105
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
106 $error = system("rm $out_table");
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
107 if ($error) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
108 print "Error removing $out_table\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
109 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
110
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
111 sub read_contig {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
112 my ($reads_found,$read_id);
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
113 # global variables
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
114 $cl = 0;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
115 $contig = 0;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
116 $cont_length = 0;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
117 $reads = 0; # number of reads
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
118 $cons = ""; # contig consensus (including gaps *)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
119 %read_starts = (); # starts of reads within assembly
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
120 %read_lengths = (); # length of reads in assembly (may contain gaps)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
121 %read_from = (); # start of non-masked part of read sequence (relative to the read)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
122 %read_to = (); # end of non-masked part of read sequence
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
123
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
124 do {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
125 if ($radek =~/^CO CL(\d+)Contig(\d+) (\d+) (\d+)/) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
126 $cl = $1; $contig = $2; $cont_length = $3; $reads = $4;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
127 while ($radek = <IN> and length($radek) > 1) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
128 chomp($radek);
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
129 $cons .= $radek;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
130 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
131 do {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
132 if ($radek =~/^AF (\S+) [UC] ([-]?\d+)/) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
133 #print "$1 : $2\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
134 $read_starts{$1} = $2;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
135 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
136 } while ($radek = <IN> and not $radek =~/^BS \d+ \d+/);
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
137 $reads_found = 0;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
138 while ($reads_found < $reads) { # expects previously specified number of reads
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
139 $radek = <IN>; # expects RD lines with read ids alternate with QA lines
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
140 if ($radek =~/^RD (\S+) (\d+)/) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
141 $read_id = $1;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
142 $read_lengths{$read_id} = $2;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
143 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
144 if ($radek =~/^QA (\d+) (\d+)/) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
145 $read_from{$read_id} = $1;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
146 $read_to{$read_id} = $2;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
147 $reads_found++;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
148 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
149 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
150 return 1;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
151 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
152 } while ($radek = <IN>);
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
153 return 0;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
154 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
155
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
156 sub reconstruct_assembly {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
157 my ($id,$min_start,$max_end,$shift,$f,$poz);
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
158 # global variables
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
159 @assembly_seq = (); # sequence at each position of assembly; it corresponds to consensus, or regions
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
160 # before (-) and after (+) consensus [assembly may be longer than consensus]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
161 @assembly_char = (); # number of assembly positions with non-masked characters (nucleotides or gaps)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
162 @assembly_masked = (); # number of masked positions
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
163 $assembly_length = 0; # total length, including - and + regions
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
164
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
165 $min_start = 1; $max_end = 1;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
166 foreach $id (keys(%read_starts)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
167 if ($min_start > $read_starts{$id}) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
168 $min_start = $read_starts{$id};
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
169 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
170 if ($max_end < $read_starts{$id}+$read_lengths{$id}-1) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
171 $max_end = $read_starts{$id}+$read_lengths{$id}-1;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
172 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
173 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
174 if ($min_start < 1) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
175 $shift = abs($min_start) +1;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
176 $assembly_length = $shift + $max_end;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
177 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
178 $assembly_length = $max_end;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
179 $shift = 0;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
180 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
181
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
182 for ($f=1;$f<=$shift;$f++) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
183 $assembly_seq[$f] = "-";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
184 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
185 for ($f=1;$f<=$cont_length;$f++) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
186 $assembly_seq[$f+$shift] = substr($cons,$f-1,1);
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
187 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
188 for ($f=$shift+$cont_length+1;$f<=$assembly_length;$f++) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
189 $assembly_seq[$f] = "+";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
190 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
191
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
192 for ($f=1;$f<=$assembly_length;$f++) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
193 $assembly_char[$f] = 0;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
194 $assembly_masked[$f] = 0;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
195 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
196 foreach $id (keys(%read_starts)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
197 for ($f=1;$f<=$read_lengths{$id};$f++) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
198 $poz = $read_starts{$id} + $shift + $f - 1;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
199 if ($f>=$read_from{$id} and $f<=$read_to{$id}) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
200 $assembly_char[$poz]++;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
201 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
202 $assembly_masked[$poz]++;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
203 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
204 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
205 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
206 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
207
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
208 sub revcompl {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
209 my ($input) = @_;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
210 my ($base,$seq,$f);
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
211
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
212 $seq = "";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
213 for ($f=length($input)-1;$f>=0;$f--) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
214 $base = substr($input,$f,1);
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
215 if ($base eq "A") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
216 $seq .= "T";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
217 } elsif ($base eq "T") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
218 $seq .= "A";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
219 } elsif ($base eq "G") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
220 $seq .= "C";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
221 } elsif ($base eq "C") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
222 $seq .= "G";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
223 } elsif ($base eq "+" or $base eq "-") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
224 $seq .= $base;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
225 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
226 $seq .= "N";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
227 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
228 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
229 return $seq;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
230 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
231
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
232 sub find_sites {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
233 my ($f,$site_sum_char,$site_sum_masked,$out_sum_char,$site_seq,$out_sum_masked,@TG,@CA,$pos);
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
234 my ($masked_ratio_site,$masked_ratio_out,$site_depth,$out_masked,$region);
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
235
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
236 # find positions of LTR borders (TG and CA)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
237 @TG = (); # positions of "T"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
238 @CA = (); # positions of "A"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
239 for ($f=1;$f<$assembly_length;$f++) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
240 if ($assembly_seq[$f] eq "T" and $assembly_seq[$f+1] eq "G") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
241 push(@TG,$f);
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
242 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
243 if ($assembly_seq[$f] eq "C" and $assembly_seq[$f+1] eq "A") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
244 push(@CA,$f+1);
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
245 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
246 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
247
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
248 foreach $pos (@TG) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
249 if ($pos-$window > 0 and $pos+$window-1 <= $assembly_length) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
250 $site_sum_char = 0; $site_sum_masked = 0; $site_seq = "";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
251 for ($f=$pos;$f<$pos+$window;$f++) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
252 $site_sum_char += $assembly_char[$f];
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
253 $site_sum_masked += $assembly_masked[$f];
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
254 $site_seq .= $assembly_seq[$f];
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
255 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
256 $out_sum_char = 0; $out_sum_masked = 0;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
257 for ($f=$pos-$window;$f<$pos;$f++) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
258 $out_sum_char += $assembly_char[$f];
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
259 $out_sum_masked += $assembly_masked[$f];
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
260 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
261 $site_depth = sprintf("%0.1f",$site_sum_char/$window); # average read (unmasked) depth over the site
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
262 $out_masked = sprintf("%0.1f",$out_sum_masked/$window); # average number of masked reads outside the site
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
263 $masked_ratio_site = sprintf("%0.4f",$site_sum_masked/($site_sum_masked+$site_sum_char));
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
264 $masked_ratio_out = sprintf("%0.4f",$out_sum_masked/($out_sum_masked+$out_sum_char));
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
265 if ($site_depth >= $min_site_depth and $out_masked >= $min_out_masked) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
266 if ($masked_ratio_out >= ($min_masked_fold_diff * $masked_ratio_site) and $max_char_to_masked >= ($site_depth/$out_masked)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
267 print OUT "$cl\t$contig\tTG\t$pos\t$site_seq\t$site_depth\t$out_masked\t$masked_ratio_site\t$masked_ratio_out\t";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
268 $region = "";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
269 for ($f=$pos;$f<=$assembly_length;$f++) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
270 if ($assembly_seq[$f] ne "*") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
271 $region .= $assembly_seq[$f];
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
272 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
273 if (length($region) == $extract_region) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
274 $f = $assembly_length; # terminate cycle
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
275 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
276 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
277 print OUT "$region\t";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
278 print LTR ">CL",$cl,"c".$contig."_TG_$pos\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
279 $region = &revcompl($region);
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
280 print LTR "$region\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
281 $region = "";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
282 for ($f=$pos-1;$f>0;$f=$f-1) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
283 if ($assembly_seq[$f] ne "*") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
284 $region = $assembly_seq[$f].$region;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
285 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
286 if (length($region) == $extract_region) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
287 $f = 0; # terminate cycle
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
288 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
289 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
290 print OUT "$region\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
291 print ADJ ">CL",$cl,"c".$contig."_TG_$pos\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
292 $region = &revcompl($region);
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
293 print ADJ "$region\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
294 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
295 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
296 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
297 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
298
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
299 foreach $pos (@CA) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
300 if ($pos-$window+1 > 0 and $pos+$window <= $assembly_length) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
301 $site_sum_char = 0; $site_sum_masked = 0; $site_seq = "";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
302 for ($f=$pos-$window+1;$f<=$pos;$f++) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
303 $site_sum_char += $assembly_char[$f];
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
304 $site_sum_masked += $assembly_masked[$f];
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
305 $site_seq .= $assembly_seq[$f];
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
306 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
307 $out_sum_char = 0; $out_sum_masked = 0;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
308 for ($f=$pos+1;$f<=$pos+$window;$f++) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
309 $out_sum_char += $assembly_char[$f];
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
310 $out_sum_masked += $assembly_masked[$f];
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
311 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
312 $site_depth = sprintf("%0.1f",$site_sum_char/$window); # average read (unmasked) depth over the site
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
313 $out_masked = sprintf("%0.1f",$out_sum_masked/$window); # average number of masked reads outside the site
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
314 $masked_ratio_site = sprintf("%0.4f",$site_sum_masked/($site_sum_masked+$site_sum_char));
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
315 $masked_ratio_out = sprintf("%0.4f",$out_sum_masked/($out_sum_masked+$out_sum_char));
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
316 if ($site_depth >= $min_site_depth and $out_masked >= $min_out_masked) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
317 if ($masked_ratio_out >= ($min_masked_fold_diff * $masked_ratio_site) and $max_char_to_masked >= ($site_depth/$out_masked)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
318 print OUT "$cl\t$contig\tCA\t$pos\t$site_seq\t$site_depth\t$out_masked\t$masked_ratio_site\t$masked_ratio_out\t";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
319 $region = "";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
320 for ($f=$pos;$f>0;$f=$f-1) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
321 if ($assembly_seq[$f] ne "*") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
322 $region = $assembly_seq[$f].$region;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
323 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
324 if (length($region) == $extract_region) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
325 $f = 0; # terminate cycle
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
326 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
327 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
328 print OUT "$region\t";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
329 print LTR ">CL",$cl,"c".$contig."_CA_$pos\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
330 print LTR "$region\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
331 $region = "";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
332 for ($f=$pos+1;$f<=$assembly_length;$f++) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
333 if ($assembly_seq[$f] ne "*") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
334 $region .= $assembly_seq[$f];
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
335 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
336 if (length($region) == $extract_region) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
337 $f = $assembly_length; # terminate cycle
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
338 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
339 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
340 print OUT "$region\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
341 print ADJ ">CL",$cl,"c".$contig."_CA_$pos\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
342 print ADJ "$region\n";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
343 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
344 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
345 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
346 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
347 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
348
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
349 sub add_PBS_info {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
350 my ($pbs_blast_command,@pol,$rad,$prev_query,@table,$tab_length);
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
351
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
352 $pbs_blast_command = "blastall -p blastn -d $db_PBS -i $out_ADJ -m 8 -b 1 -e 1 -W 7 -F F";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
353
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
354 @table = ();
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
355 open (TAB,$out_table) or die;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
356 while ($rad = <TAB>) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
357 push(@table,$rad);
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
358 $tab_length++;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
359 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
360 close TAB;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
361
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
362 open (BLAST,"$pbs_blast_command |") or die;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
363 $prev_query = "";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
364 while ($rad = <BLAST>) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
365 if ($rad =~/^CL(\d+)c(\d+)_(TG|CA)_(\d+)\t\S+\t\S+\t/) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
366 if ("$1\t$2\t$3\t$4" ne $prev_query) { # to exclude additional HSPs from the same query/subject pair
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
367 for ($f=0;$f<$tab_length;$f++) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
368 @pol = split(/\t/,$table[$f]);
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
369 if ($pol[0] eq "$1" and $pol[1] eq "$2" and $pol[2] eq "$3" and $pol[3] eq "$4") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
370 chomp($table[$f]);
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
371 $table[$f] .= "\t$rad";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
372 $f = $tab_length; # terminate cycle
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
373 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
374 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
375 $prev_query = "$1\t$2\t$3\t$4";
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
376 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
377 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
378 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
379 close BLAST;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
380
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
381 open (TAB_WITH_BLAST,">$out_table.with_PBS_blast.csv") or die;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
382 for ($f=0;$f<$tab_length;$f++) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
383 print TAB_WITH_BLAST $table[$f];
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
384 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
385 close TAB_WITH_BLAST;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
386 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
387
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
388
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
389
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
390