|
2
|
1 #!/usr/bin/perl -w
|
|
|
2 #use strict;
|
|
|
3
|
|
|
4 %arguments=
|
|
|
5 (
|
|
|
6 "vcf"=>"", #file mandatory, provided at runtime
|
|
|
7 "disease"=>"", #name optional
|
|
|
8 "similarD"=>"", #file optional
|
|
|
9 "lgenes"=>"", #file optional
|
|
|
10 "leQTL"=>"qfile", #file mandatory, but default value
|
|
|
11 "keywords"=>"kfile", #file mandatory, but default value
|
|
|
12 "effects"=>"efile", #file mandatory, but default value
|
|
|
13 "disease_clinvar"=>8, #numeric mandadory, but default value
|
|
|
14 "score_AF"=>4, #numeric mandatory, but default value
|
|
|
15 "score_functional"=>8, #numeric mandatory, but default value
|
|
|
16 "score_NS"=>6, #numeric mandatory, but default value
|
|
|
17 "score_nIND"=>8, #numeric mandatory, but default value
|
|
|
18 "AF"=>0.0001, #numeric mandatory, but default value
|
|
|
19 "scoreeQTL"=>1, #numeric mandatory, but default value
|
|
|
20 "nind"=>5, #numeric mandatory, but default value
|
|
|
21 "scoreG"=>2, #numeric mandatory, but default value
|
|
|
22 #####OUTPUT file#############################################
|
|
|
23 "ofile"=>"final_res.csv", #file #OUTPUT #tabulare
|
|
|
24 "ovcfile"=>"final_res.vcf" #file #OUTOUT #vcf
|
|
|
25 );
|
|
|
26
|
|
|
27
|
|
|
28 @arguments=@ARGV;
|
|
|
29 for ($i=0;$i<=$#ARGV;$i+=2)
|
|
|
30 {
|
|
|
31 $act=$ARGV[$i];
|
|
|
32 $act=~s/-//g;
|
|
|
33 $val=$ARGV[$i+1];
|
|
|
34 if (exists $arguments{$act})
|
|
|
35 {
|
|
|
36 $arguments{$act}=$val;
|
|
|
37 }else{
|
|
|
38 warn("$act: unknown argument\n");
|
|
|
39 @valid=keys %arguments;
|
|
|
40 warn("Valid arguments are @valid\n");
|
|
|
41 die("All those moments will be lost in time, like tears in rain.\n Time to die!\n");
|
|
|
42 }
|
|
|
43 #print "$act $val\n";
|
|
|
44 }
|
|
|
45
|
|
|
46 $ofile_name=$arguments{"ofile"};
|
|
|
47 open(O,">$ofile_name");
|
|
|
48 $ovcfile=$arguments{"ovcfile"};
|
|
|
49 open(OV,">$ovcfile");
|
|
|
50
|
|
|
51
|
|
|
52 $file=$arguments{"vcf"} ;
|
|
|
53 die ("input file $file not found!\n") unless -e $file;
|
|
|
54
|
|
|
55 $lgenes=$arguments{"lgenes"};
|
|
|
56 if (-e $lgenes)
|
|
|
57 {
|
|
|
58 open(IN,$lgenes);
|
|
|
59 while(<IN>)
|
|
|
60 {
|
|
|
61 chomp;
|
|
|
62 $Lgenes{$_}=1;
|
|
|
63 }
|
|
|
64 }
|
|
|
65
|
|
|
66
|
|
|
67 $kfile=$arguments{"keywords"};
|
|
|
68 die ("keyword file $kfile not found!\n") unless -e $kfile;
|
|
|
69 open(IN,$kfile);
|
|
|
70 while(<IN>)
|
|
|
71 {
|
|
|
72 chomp;
|
|
|
73 $specialKeys{$_}=1;
|
|
|
74 }
|
|
|
75
|
|
|
76
|
|
|
77 $diseaseO=$arguments{"disease"} ? $arguments{"disease"} : "GinocchioValgoDellaLavandaiaZoppa";
|
|
|
78
|
|
|
79 $sfile=$arguments{"similarD"};
|
|
|
80 if (-e $sfile)
|
|
|
81 {
|
|
|
82 open(IN,$sfile);
|
|
|
83 while(<IN>)
|
|
|
84 {
|
|
|
85 chomp;
|
|
|
86 push(@kw,$_)
|
|
|
87 }
|
|
|
88 }
|
|
|
89
|
|
|
90 $efile=$arguments{"effects"};
|
|
|
91 die ("effect file $efile not found!\n") unless -e $efile;
|
|
|
92 open(IN,$efile);
|
|
|
93 while(<IN>)
|
|
|
94 {
|
|
|
95 chomp;
|
|
|
96 $effects{$_}=1;
|
|
|
97 }
|
|
|
98
|
|
|
99 $leQTLfile=$arguments{"leQTL"};
|
|
|
100 if (-e $leQTLfile)
|
|
|
101 {
|
|
|
102 open(IN,$leQTLfile);
|
|
|
103 while(<IN>)
|
|
|
104 {
|
|
|
105 chomp();
|
|
|
106 $Qlist{$_}=1;
|
|
|
107 }
|
|
|
108 }
|
|
|
109
|
|
|
110 $disease_clinvar=$arguments{"disease_clinvar"};
|
|
|
111 $score_AF=$arguments{"score_AF"};
|
|
|
112 $score_functional=$arguments{"score_functional"};
|
|
|
113 $score_NS=$arguments{"score_NS"};
|
|
|
114 $score_nIND=$arguments{"score_nIND"};
|
|
|
115 $score_QTL=$arguments{"scoreeQTL"};
|
|
|
116 $scoreG=$arguments{"scoreG"};
|
|
|
117
|
|
|
118 print O "CHR\tstart\tgene\tref\talt\tAC\t";
|
|
|
119 foreach $k (sort keys %specialKeys)
|
|
|
120 {
|
|
|
121 print O "$k\t";
|
|
|
122 }
|
|
|
123 print O "Score\n";
|
|
|
124
|
|
|
125 %damaged=();
|
|
|
126 open(IN,$file);
|
|
|
127
|
|
|
128 #print "$gene_score $disease_HGMD $disease_clinvar $score_functional $score_AF $score_NS $score_eQTL $score_nIND\n";
|
|
|
129 while(<IN>)
|
|
|
130 {
|
|
|
131 if ($_=~/^#/)
|
|
|
132 {
|
|
|
133 print OV;
|
|
|
134 next;
|
|
|
135 }
|
|
|
136 chomp();
|
|
|
137 @val=(split(/\t/));
|
|
|
138 $chr=$val[0];
|
|
|
139 $start=$val[1];
|
|
|
140 $pstart=$val[2];
|
|
|
141 $b1=$val[3];
|
|
|
142 $b2=$val[4];
|
|
|
143 $qscore=$val[5];
|
|
|
144 $pb2=$val[6];
|
|
|
145 $gene="na";
|
|
|
146 $annot=$val[7];
|
|
|
147 $gt=$val[8];
|
|
|
148 @samples=@val[9..$#val];
|
|
|
149 $samples=(join("\t",@samples));
|
|
|
150 @terms=(split(/\;/,$annot));
|
|
|
151 $DIS=0;
|
|
|
152 if ($_=~/;AC=(\d+);/)
|
|
|
153 {
|
|
|
154 $nind=$1;
|
|
|
155 }
|
|
|
156 if ($_=~/Gene.refGene=(\w+);/)
|
|
|
157 {
|
|
|
158 $gene=$1;
|
|
|
159 }
|
|
|
160 next if $nind==0;
|
|
|
161 $gene=(split(/\,/,$gene))[0];
|
|
|
162 $i=0;
|
|
|
163 %keep=();
|
|
|
164 $score=0;
|
|
|
165 $score+=$scoreG if $Lgenes{$gene};
|
|
|
166 foreach $t (@terms)
|
|
|
167 {
|
|
|
168 ($keep,$value)=(split(/\=/,$t))[0,1];
|
|
|
169 $keep{$keep}=$value;
|
|
|
170 }
|
|
|
171
|
|
|
172 if ($keep{"CLNSIG"} ne "."){
|
|
|
173 $scoreO=0;
|
|
|
174 $scoreC=0;
|
|
|
175 $add=0;
|
|
|
176 $add=$disease_clinvar if ($keep{"CLNSIG"} eq "Pathogenic" || $keep{"CLNSIG"} eq "Pathogenic,_other,_risk_factor" || $keep{"CLNSIG"} eq "pathogenic" || $keep{"CLNSIG"} eq "Pathogenic/Likely_pathogenic" );
|
|
|
177 $add=$disease_clinvar/2 if ($keep{"CLNSIG"} eq "Likely_pathogenic" || $keep{"CLNSIG"} eq "Conflicting_interpretations_of_pathogenicity" || $keep{"CLNSIG"} eq "likely-pathogenic");
|
|
|
178 $add=-$disease_clinvar/2 if ($keep{"CLNSIG"} eq "Likely_benign" || $keep{"CLNSIG"} eq "Benign/Likely_benign");
|
|
|
179 $add=-$disease_clinvar if ($keep{"CLNSIG"} eq "Benign");
|
|
|
180 @diseases=split(/\|/,$keep{"CLNDN"});
|
|
|
181 @databases=split(/\|/,$keep{"CLNDISDB"});
|
|
|
182 for ($i=0;$i<=$#diseases;$i++)
|
|
|
183 {
|
|
|
184 $dis=lc $diseases[$i];
|
|
|
185 $dat=$databases[$i];
|
|
|
186 if ($dis=~ /$diseaseO/)
|
|
|
187 {
|
|
|
188 if ($dat=~/OMIM/)
|
|
|
189 {
|
|
|
190 $scoreO=$add;
|
|
|
191 }else{
|
|
|
192 $scoreC=$add;
|
|
|
193 }
|
|
|
194 last;
|
|
|
195 }else{
|
|
|
196 foreach $kv (@kw)
|
|
|
197 {
|
|
|
198 if ($dis=~/$kv/)
|
|
|
199 {
|
|
|
200 $DIS=1 if $add>0;
|
|
|
201 if ($dat=~/OMIM/)
|
|
|
202 {
|
|
|
203 $scoreO=$add;
|
|
|
204 }else{
|
|
|
205 $scoreC=$add;
|
|
|
206 }
|
|
|
207 }
|
|
|
208 }
|
|
|
209 }
|
|
|
210 }
|
|
|
211 $score+=$scoreO+$scoreC;
|
|
|
212 }
|
|
|
213 $esp=$keep{"esp6500siv2_ea"} eq "." ? 0 : $keep{"esp6500siv2_ea"};
|
|
|
214 $g1000=$keep{"1000g2015aug_all"} eq "." ? 0 : $keep{"1000g2015aug_all"} ;
|
|
|
215 $exac=$keep{"ExAC_NFE"} eq "." ? 0 : $keep{"ExAC_NFE"};
|
|
|
216 $gnomad=$keep{"gnomAD_exome_NFE"} eq "." ? 0 : $keep{"gnomAD_exome_NFE"};
|
|
|
217 @AF=($esp,$g1000,$exac,$gnomad);
|
|
|
218 @AF=sort{$a<=>$b} @AF;
|
|
|
219 $AF=$AF[-1];
|
|
|
220 # Variante molto rara 0.0001 2 punti
|
|
|
221 # Variante rara 0.004 1 punto
|
|
|
222 if ($AF<=$arguments{"AF"}) #0,00002
|
|
|
223 {
|
|
|
224 $score+=$score_AF;
|
|
|
225 }elsif($AF>$arguments{"AF"} && $AF<=$arguments{"AF"}*4){
|
|
|
226 $score+=$score_AF/2;
|
|
|
227 }elsif($AF>0.01){ #commonSNP
|
|
|
228 $score-=$score_AF/2;
|
|
|
229 }
|
|
|
230
|
|
|
231 $effectO=(split(/\;/,$keep{"ExonicFunc.refGene"}))[0];
|
|
|
232 if ($effects{$effectO})
|
|
|
233 {
|
|
|
234 $damaged{$gene}{"D"}++;
|
|
|
235 $score+=$score_functional;
|
|
|
236 }
|
|
|
237 #print "Exon " . $keep{"ExonicFunc.refGene"}. " $score\n";
|
|
|
238 if ($keep{"ExonicFunc.refGene"} eq "nonsynonymous_SNV"){
|
|
|
239
|
|
|
240 if ($keep{"MetaSVM_pred"} eq "D" && $keep{"CADD_raw"}>=5)
|
|
|
241 {
|
|
|
242 $score+=$score_NS;
|
|
|
243 $damaged{$gene}{"D"}++;
|
|
|
244 }elsif($keep{"MetaSVM_pred"} eq "D" && $keep{"CADD_raw"}<5){
|
|
|
245 $score+=$score_NS/2;
|
|
|
246 }elsif($keep{"MetaSVM_pred"} ne "D" && $keep{"CADD_raw"}>=5){
|
|
|
247 $score+=$score_NS/2;
|
|
|
248 }
|
|
|
249 }
|
|
|
250 #print "NS ". $keep{"ExonicFunc.refGene"} . " $score\n";
|
|
|
251 $damaged{$gene}{"tot"}++;
|
|
|
252 if ($keep{"Func.refGene"} eq "splicing"){
|
|
|
253 $score+=$score_functional;
|
|
|
254 }
|
|
|
255 #print "SPL ". $keep{"Func.refGene"} . " $score\n";
|
|
|
256 foreach $QTL (keys %Qlist)
|
|
|
257 {
|
|
|
258 next unless $keep{$QTL};
|
|
|
259 $score+=$score_QTL if ($keep{$QTL} ne ".");
|
|
|
260 }
|
|
|
261 if ($nind>=$arguments{"nind"} && $AF<=0.01)
|
|
|
262 {
|
|
|
263 $score+=$score_nIND;
|
|
|
264 }elsif($nind>=$arguments{"nind"}/2 && $nind<$arguments{"nind"} && $AF<=0.01){
|
|
|
265 $score+=$score_nIND/2;
|
|
|
266 }
|
|
|
267 chomp();
|
|
|
268 #$scores{$score}++;
|
|
|
269 chop($_);
|
|
|
270 $outL="";
|
|
|
271 foreach $k (sort keys %specialKeys)
|
|
|
272 {
|
|
|
273 #die("$k\n") unless $keep{$k};
|
|
|
274 $outL.="$keep{$k}\t";
|
|
|
275 }
|
|
|
276 print O "$chr\t$start\t$gene\t$b1\t$b2\t$nind\t$outL$score\n"; #if $score>=12 || $DIS==1;
|
|
|
277 print OV "$chr\t$start\t$pstart\t$b1\t$b2\t$qscore\t$pb2\t$annot;COSO_score=$score\t$gt\t$samples\n";
|
|
|
278 }
|
|
|
279 #open(O,">$file.gc.list.csv");
|
|
|
280 #foreach $gene (sort{$a<=>$b} keys %G)
|
|
|
281 #{
|
|
|
282 # $pos=$G{$gene}{"GR"} ? $G{$gene}{"GR"} : 0;
|
|
|
283 # $neg=$G{$gene}{"LW"} ? $G{$gene}{"LW"} : 0;
|
|
|
284 #print O "$gene $pos $neg\n";
|
|
|
285 ##$Tscore+=$scores{$score} if $score>=$coff;
|
|
|
286 #}
|
|
|
287
|
|
|
288 #open(D,">$file\_damaged.csv");
|
|
|
289 #foreach $gene (sort{$a<=>$b} keys %damaged)
|
|
|
290 #{
|
|
|
291 # $D=$damaged{$gene}{"D"} ? $damaged{$gene}{"D"} : 0;
|
|
|
292 # $T=$damaged{$gene}{"tot"};
|
|
|
293 #print D "$gene $D $T\n";
|
|
|
294 #}
|
|
|
295
|
|
|
296 #print "$Tscore\n";
|