|
0
|
1 #!/usr/bin/perl -w
|
|
|
2 #use strict;
|
|
|
3
|
|
|
4 %arguments=
|
|
|
5 (
|
|
|
6 "AD"=>"T", #value mandatory, T==TRUE F==FALSE
|
|
|
7 "XL"=>"F", #value mandatory, F==FALSE T==TRUE
|
|
|
8 "vcf"=>"", #file mandatory, provided at runtime
|
|
|
9 "disease"=>"", #name optional
|
|
|
10 "similarD"=>"", #file optional
|
|
|
11 "lgenes"=>"", #file optional
|
|
|
12 "leQTL"=>"", #file optional
|
|
|
13 "keywords"=>"kfile", #file mandatory, but default value
|
|
|
14 "effects"=>"efile", #file mandatory, but default value
|
|
|
15 "disease_clinvar"=>8, #numeric mandadory, but default value
|
|
|
16 "score_AF"=>4, #numeric mandatory, but default value
|
|
|
17 "score_functional"=>8, #numeric mandatory, but default value
|
|
|
18 "score_NS"=>6, #numeric mandatory, but default value
|
|
|
19 "score_nIND"=>8, #numeric mandatory, but default value
|
|
|
20 "AF"=>0.0001, #numeric mandatory, but default value
|
|
|
21 "scoreeQTL"=>1, #numeric mandatory, but default value
|
|
|
22 "nind"=>5, #numeric mandatory, but default value
|
|
|
23 "scoreG"=>2, #numeric mandatory, but default value
|
|
|
24 "ifile"=>"", #file optional
|
|
|
25 "scoreT"=>1, #numeric mandatory, but default value
|
|
|
26 "scoreGW"=>1, #numeric mandatory but default value
|
|
|
27 "scoreM"=>1, #numeric mandatory but default value
|
|
|
28 "scoreR"=>1, #numeric mandatory but default value
|
|
|
29 "scoreSP"=>1, #numeric mandatory but default value
|
|
|
30 #####OUTPUT file#############################################
|
|
|
31 "ofile"=>"final_res.csv", #file #OUTPUT #tabulare
|
|
|
32 "ovcfile"=>"final_res.vcf",#, #file #OUTOUT #vcf
|
|
|
33 "osummary"=>"detailed_final_res.csv"
|
|
|
34 );
|
|
|
35
|
|
|
36 @arguments=@ARGV;
|
|
|
37 for ($i=0;$i<=$#ARGV;$i+=2)
|
|
|
38 {
|
|
|
39 $act=$ARGV[$i];
|
|
|
40 $act=~s/-//g;
|
|
|
41 $val=$ARGV[$i+1];
|
|
|
42 if (exists $arguments{$act})
|
|
|
43 {
|
|
|
44 $arguments{$act}=$val;
|
|
|
45 }else{
|
|
|
46 warn("$act: unknown argument\n");
|
|
|
47 @valid=keys %arguments;
|
|
|
48 warn("Valid arguments are @valid\n");
|
|
|
49 die("All those moments will be lost in time, like tears in rain.\n Time to die!\n");
|
|
|
50 }
|
|
|
51 }
|
|
|
52
|
|
|
53 $ofile_name=$arguments{"ofile"};
|
|
|
54 open(O,">$ofile_name");
|
|
|
55 $ovcfile=$arguments{"ovcfile"};
|
|
|
56 open(OV,">$ovcfile");
|
|
|
57 $osummary_name=$arguments{"osummary"};
|
|
|
58 open(OS,">$osummary_name");
|
|
|
59 print OS "chr\tstart\tref\talt\tNhom\tNhet\tNind\tGene\tScoreG\tScoreCV\tScoreOth\tScoreAF\tScoreEff\tScoreSP\tScoreTF\tScoremir\tScoreREG\tScoreGWAS\tScoreNS\tScoreQTL\tScoreNi\tScoreT\n";
|
|
|
60
|
|
|
61 $ifile=$arguments{"ifile"};
|
|
|
62 if (-e $ifile)
|
|
|
63 {
|
|
|
64 open(IN,$ifile);
|
|
|
65 while(<IN>)
|
|
|
66 {
|
|
|
67 #($G1,$G2)=(split());
|
|
|
68 #push (@{$interact{$G1}},$G2);
|
|
|
69 }
|
|
|
70 }
|
|
|
71
|
|
|
72
|
|
|
73 $file=$arguments{"vcf"} ;
|
|
|
74 die ("input file $file not found!\n") unless -e $file;
|
|
|
75
|
|
|
76 $lgenes=$arguments{"lgenes"};
|
|
|
77 if (-e $lgenes)
|
|
|
78 {
|
|
|
79 open(IN,$lgenes);
|
|
|
80 while(<IN>)
|
|
|
81 {
|
|
|
82 chomp;
|
|
|
83 $G=(split())[0];
|
|
|
84 $Lgenes{$G}=1;
|
|
|
85 }
|
|
|
86 }
|
|
|
87
|
|
|
88
|
|
|
89 $kfile=$arguments{"keywords"};
|
|
|
90 die ("keyword file $kfile not found!\n") unless -e $kfile;
|
|
|
91 open(IN,$kfile);
|
|
|
92 while(<IN>)
|
|
|
93 {
|
|
|
94 chomp;
|
|
|
95 ($k,$category)=(split())[0,1];
|
|
|
96 $specialKeys{$k}=$category;
|
|
|
97 push (@{$annotKeys{$category}},$k);
|
|
|
98 }
|
|
|
99
|
|
|
100 $diseaseO=$arguments{"disease"} ? $arguments{"disease"} : "GinocchioValgoDellaLavandaiaZoppa";
|
|
|
101 @diseaseO=split(/#/,$diseaseO);
|
|
|
102
|
|
|
103 $sfile=$arguments{"similarD"};
|
|
|
104 if (-e $sfile)
|
|
|
105 {
|
|
|
106 open(IN,$sfile);
|
|
|
107 while(<IN>)
|
|
|
108 {
|
|
|
109 chomp;
|
|
|
110 $Sw=lc($_);
|
|
|
111 $Sw=~s/\s+//;
|
|
|
112 push(@kw,$Sw);
|
|
|
113 }
|
|
|
114 }
|
|
|
115
|
|
|
116 $efile=$arguments{"effects"};
|
|
|
117 die ("effect file $efile not found!\n") unless -e $efile;
|
|
|
118 open(IN,$efile);
|
|
|
119 while(<IN>)
|
|
|
120 {
|
|
|
121 chomp;
|
|
|
122 $effects{$_}=1;
|
|
|
123 }
|
|
|
124
|
|
|
125 $leQTLfile=$arguments{"leQTL"};
|
|
|
126 if (-e $leQTLfile)
|
|
|
127 {
|
|
|
128 open(IN,$leQTLfile);
|
|
|
129 while(<IN>)
|
|
|
130 {
|
|
|
131 chomp();
|
|
|
132 $Qlist{$_}=1;
|
|
|
133 }
|
|
|
134 }
|
|
|
135
|
|
|
136 $disease_clinvar=$arguments{"disease_clinvar"};
|
|
|
137 $score_AF=$arguments{"score_AF"};
|
|
|
138 $score_functional=$arguments{"score_functional"};
|
|
|
139 $score_NS=$arguments{"score_NS"};
|
|
|
140 $score_nIND=$arguments{"score_nIND"};
|
|
|
141 $score_QTL=$arguments{"scoreeQTL"};
|
|
|
142 $scoreG=$arguments{"scoreG"};
|
|
|
143 $scoreM=$arguments{"scoreM"};
|
|
|
144 $scoreT=$arguments{"scoreT"};
|
|
|
145 $scoreR=$arguments{"scoreR"};
|
|
|
146 $scoreGW=$arguments{"scoreGW"};
|
|
|
147 $scoreSP=$arguments{"scoreSP"};
|
|
|
148
|
|
|
149 #print O "CHR\tstart\tgene\tref\talt\tAC\tNhom\tNhet\tNind\tGene\tScoreCV\tScoreOth\tScoreAF\tScoreEff\tScoreNS\tScoreQTL\tScoreNi\tScoreT\n";
|
|
|
150 print O "CHR\tstart\tgene\tref\talt\tAC\t";
|
|
|
151
|
|
|
152 foreach $k (sort keys %specialKeys)
|
|
|
153 {
|
|
|
154 print O "$k\t";
|
|
|
155 }
|
|
|
156 print O "VINYL_score\n";
|
|
|
157
|
|
|
158
|
|
|
159 open(IN,$file);
|
|
|
160
|
|
|
161 #print "$gene_score $disease_HGMD $disease_clinvar $score_functional $score_AF $score_NS $score_eQTL $score_nIND\n";
|
|
|
162 while(<IN>)
|
|
|
163 {
|
|
|
164 if ($_=~/^#/)
|
|
|
165 {
|
|
|
166 print OV;
|
|
|
167 next;
|
|
|
168 }
|
|
|
169 chomp();
|
|
|
170 $summary_line="";
|
|
|
171 @val=(split(/\t/));
|
|
|
172 $chr=$val[0];
|
|
|
173 $start=$val[1];
|
|
|
174 $pstart=$val[2];
|
|
|
175 $b1=$val[3];
|
|
|
176 $b2=$val[4];
|
|
|
177 $qscore=$val[5];
|
|
|
178 $pb2=$val[6];
|
|
|
179 $gene="na";
|
|
|
180 $annot=$val[7];
|
|
|
181 $gt=$val[8];
|
|
|
182 @samples=@val[9..$#val];
|
|
|
183 $Nhom=0;
|
|
|
184 $Nhet=0;
|
|
|
185 $summary_line.="$chr\t$start\t$b1\t$b2\t";
|
|
|
186 foreach $s (@samples)
|
|
|
187 {
|
|
|
188 $sid=(split(/\:/,$s))[0];
|
|
|
189 next if $sid eq ".";
|
|
|
190 if ($sid=~/\|/)
|
|
|
191 {
|
|
|
192 ($h1,$h2)=(split(/\|/,$sid))[0,1];
|
|
|
193 }elsif($sid=~/\//){
|
|
|
194 ($h1,$h2)=(split(/\//,$sid))[0,1];
|
|
|
195 }
|
|
|
196 $Nhom++ if $h1==$h2 && $h1 !=0;
|
|
|
197 $Nhet++ if $h1!=$h2 && ($h1!=0 && $h2!=0) ;
|
|
|
198 }
|
|
|
199 $summary_line.="$Nhom\t$Nhet\t";
|
|
|
200 $samples=(join("\t",@samples));
|
|
|
201 @terms=(split(/\;/,$annot));
|
|
|
202 $DIS=0;
|
|
|
203 if ($_=~/;AC=(\d+);/ || $_=~/\tAC=(\d+);/)
|
|
|
204 {
|
|
|
205 $nind=$1;
|
|
|
206 }
|
|
|
207 if ($_=~/Gene.refGene=(\w+);/)
|
|
|
208 {
|
|
|
209 $gene=$1;
|
|
|
210 }
|
|
|
211 $summary_line.="$nind\t$gene\t";
|
|
|
212 next if $nind==0;
|
|
|
213 $gene=(split(/\,/,$gene))[0];
|
|
|
214 $i=0;
|
|
|
215 %keep=();
|
|
|
216 $score=0;
|
|
|
217 $G=0;
|
|
|
218 if ($Lgenes{$gene})
|
|
|
219 {
|
|
|
220 $score+=$scoreG;
|
|
|
221 $G=$scoreG;
|
|
|
222 }
|
|
|
223 $summary_line.="$G\t";
|
|
|
224 foreach $t (@terms)
|
|
|
225 {
|
|
|
226 next unless $t;
|
|
|
227 ($keep,$value)=(split(/\=/,$t))[0,1];
|
|
|
228 $value="." unless ($value);
|
|
|
229 $keep{$keep}=$value;
|
|
|
230 }
|
|
|
231 if ($keep{"CLNSIG"} ne "."){
|
|
|
232 $scoreO=0;
|
|
|
233 $scoreC=0;
|
|
|
234 $add=0;
|
|
|
235 $add=$disease_clinvar if ($keep{"CLNSIG"} eq "Pathogenic" || $keep{"CLNSIG"} eq "Pathogenic,_other,_risk_factor" || $keep{"CLNSIG"} eq "pathogenic" || $keep{"CLNSIG"} eq "Pathogenic/Likely_pathogenic" );
|
|
|
236 $add=$disease_clinvar/2 if ($keep{"CLNSIG"} eq "Likely_pathogenic" || $keep{"CLNSIG"} eq "Conflicting_interpretations_of_pathogenicity" || $keep{"CLNSIG"} eq "likely-pathogenic");
|
|
|
237 $add-=$disease_clinvar/4 if ($keep{"CLNSIG"} eq "Likely_benign" || $keep{"CLNSIG"} eq "Benign/Likely_benign");
|
|
|
238 $add-=$disease_clinvar/2 if ($keep{"CLNSIG"} eq "Benign");
|
|
|
239 @diseases=split(/\|/,$keep{"CLNDN"});
|
|
|
240 @databases=split(/\|/,$keep{"CLNDISDB"});
|
|
|
241 for ($i=0;$i<=$#diseases;$i++)
|
|
|
242 {
|
|
|
243 $dis=lc $diseases[$i];
|
|
|
244 $dat=$databases[$i];
|
|
|
245 $MDO=0;
|
|
|
246 foreach $disOL (@diseaseO)
|
|
|
247 {
|
|
|
248 $disOL=lc $disOL;
|
|
|
249 if ($dis=~ /$disOL/)
|
|
|
250 {
|
|
|
251 if ($dat=~/OMIM/)
|
|
|
252 {
|
|
|
253 $scoreO=$add;
|
|
|
254 }else{
|
|
|
255 $scoreC=$add;
|
|
|
256 }
|
|
|
257 $MDO=1;
|
|
|
258 last;
|
|
|
259 }
|
|
|
260 }
|
|
|
261 if ($MDO==0)
|
|
|
262 {
|
|
|
263 foreach $kv (@kw)
|
|
|
264 {
|
|
|
265 if ($dis=~/$kv/)
|
|
|
266 {
|
|
|
267 $DIS=1 if $add>0;
|
|
|
268 if ($dat=~/OMIM/)
|
|
|
269 {
|
|
|
270 $scoreO=$add;
|
|
|
271 }else{
|
|
|
272 $scoreC=$add;
|
|
|
273 }
|
|
|
274 }
|
|
|
275 }
|
|
|
276 }
|
|
|
277 }
|
|
|
278 $score+=$scoreO+$scoreC;
|
|
|
279 $summary_line.="$scoreO\t$scoreC\t";
|
|
|
280 #duplicate this block for KW annotated
|
|
|
281 }else{
|
|
|
282 $summary_line.="0\t0\t";
|
|
|
283 }
|
|
|
284 @AFkeys=@{$annotKeys{"AF"}};
|
|
|
285 $AF=0;
|
|
|
286 foreach $AFK (@AFkeys)
|
|
|
287 {
|
|
|
288 $LOC_af=$keep{$AFK} ? $keep{$AFK} : 0 ;
|
|
|
289 $LOC_af=0 if $LOC_af eq ".";
|
|
|
290 $AF=$LOC_af if $LOC_af>$AF;
|
|
|
291 }
|
|
|
292 $AF=$AF/20 if ($chr eq "chr1" && ($start==228557681 || $start==228527758 || $start==228525823));
|
|
|
293 if ($AF<=$arguments{"AF"}) #0,00002
|
|
|
294 {
|
|
|
295 $score+=$score_AF;
|
|
|
296 $summary_line.="$score_AF\t";
|
|
|
297 }elsif($AF>$arguments{"AF"} && $AF<=$arguments{"AF"}*4){
|
|
|
298 $score+=$score_AF/2;
|
|
|
299 $summary_line.=$score_AF/2 ."\t";
|
|
|
300 }elsif($AF>$arguments{"AF"}*4 && $AF<=0.01){
|
|
|
301 $summary_line.="0\t";
|
|
|
302 }elsif($AF>0.01){ #commonSNP
|
|
|
303 $score-=$score_AF/2;
|
|
|
304 $summary_line.=-$score_AF/2 ."\t";
|
|
|
305 }
|
|
|
306
|
|
|
307 @EFFkeys=@{$annotKeys{"Effect"}};
|
|
|
308 $EFFS=0;
|
|
|
309 foreach $EFF (@EFFkeys)
|
|
|
310 {
|
|
|
311 $effectO=(split(/\;/,$keep{$EFF}))[0];
|
|
|
312 $score+=$score_functional if $effects{$effectO};
|
|
|
313 $EFFS+=$score_functional if $effects{$effectO};
|
|
|
314 }
|
|
|
315 $summary_line.="$EFFS\t";
|
|
|
316
|
|
|
317 @SPKeys=@{$annotKeys{"Splice"}};
|
|
|
318 $SPs=0;
|
|
|
319 foreach $SPk (@SPKeys)
|
|
|
320 {
|
|
|
321 next if ($keep{$SPk} eq ".");
|
|
|
322 if($keep{$SPk}>0.6)
|
|
|
323 {
|
|
|
324 $score+=$scoreSP/($#SPKeys+1);
|
|
|
325 $SPs+=$scoreSP/($#SPKeys+1);
|
|
|
326 }
|
|
|
327 }
|
|
|
328 $summary_line.="$SPs\t";
|
|
|
329
|
|
|
330 $TFscore=0;
|
|
|
331 @TFBSkeys=@{$annotKeys{"tfbs"}};
|
|
|
332 foreach $T (@TFBSkeys)
|
|
|
333 {
|
|
|
334 $score+=$scoreT if $keep{$T} ne ".";
|
|
|
335 $TFscore+=$scoreT if $keep{$T} ne ".";
|
|
|
336 }
|
|
|
337 $summary_line.="$TFscore\t";
|
|
|
338
|
|
|
339 $mirscore=0;
|
|
|
340 @mirkeys=@{$annotKeys{"mirna"}};
|
|
|
341 foreach $M (@mirkeys)
|
|
|
342 {
|
|
|
343 $score+=$scoreM if $keep{$M} ne ".";
|
|
|
344 $mirscore+=$scoreM if $keep{$M} ne ".";
|
|
|
345 }
|
|
|
346 $summary_line.="$mirscore\t";
|
|
|
347
|
|
|
348 $REGscore=0;
|
|
|
349 @REGkeys=@{$annotKeys{"Reg"}};
|
|
|
350 foreach $R (@REGkeys)
|
|
|
351 {
|
|
|
352 $score+=$scoreR if $keep{$R} ne ".";
|
|
|
353 $REGscore+=$scoreR if $keep{$R} ne ".";
|
|
|
354 }
|
|
|
355 $summary_line.="$REGscore\t";
|
|
|
356
|
|
|
357 $GWscore=0;
|
|
|
358 if ($keep{"GWAS"} ne ".")
|
|
|
359 {
|
|
|
360 $GW=$keep{"GWAS"};
|
|
|
361 $GW=lc($GW);
|
|
|
362 @Gs=(split(/\|/,$GW));
|
|
|
363 foreach $Gword (@Gs)
|
|
|
364 {
|
|
|
365 next if $Gword eq " ";
|
|
|
366 $Gword=~s/-/ /g;
|
|
|
367 foreach $kv (@kw)
|
|
|
368 {
|
|
|
369 if ($Gword=~/$kv/)
|
|
|
370 {
|
|
|
371 $score+=$scoreGW;
|
|
|
372 $GWscore+=$scoreGW;
|
|
|
373 last;
|
|
|
374 }
|
|
|
375 }
|
|
|
376 }
|
|
|
377
|
|
|
378 }
|
|
|
379 $summary_line.="$GWscore\t";
|
|
|
380
|
|
|
381 $NSa=0;
|
|
|
382 #print "Exon " . $keep{"ExonicFunc.refGene"}. " $score\n"; #modify here to use kwords in conf file
|
|
|
383 if ($keep{"ExonicFunc.refGene"} eq "nonsynonymous_SNV")
|
|
|
384 {
|
|
|
385 $ND=0;
|
|
|
386 @NStools=@{$annotKeys{"NStool"}};
|
|
|
387 foreach $t (@NStools)
|
|
|
388 {
|
|
|
389 $NSscore=$keep{$t};
|
|
|
390 next if $NSscore eq ".";
|
|
|
391 $NSscore="D" if $t=~/CADD/ && $NSscore>=5;
|
|
|
392 $ND++ if $NSscore eq "D";
|
|
|
393 }
|
|
|
394
|
|
|
395 if ($ND==$#NStools+1)
|
|
|
396 {
|
|
|
397 $score+=$score_NS;
|
|
|
398 $NSa+=$score_NS;
|
|
|
399 }elsif($ND>=($#NStools+1)/2){
|
|
|
400 $score+=$score_NS/(($#NStools+1)/2);
|
|
|
401 $NSa+=$score_NS/(($#NStools+1)/2);
|
|
|
402 }elsif($ND==0){
|
|
|
403 $score+=$score_NS/($#NStools+1); #da commentare. o NS/4
|
|
|
404 $NSa+=$score_NS/($#NStools+1);
|
|
|
405 }
|
|
|
406 }
|
|
|
407 $summary_line.="$NSa\t";
|
|
|
408 $iQTL=0;
|
|
|
409 foreach $QTL (keys %Qlist)
|
|
|
410 {
|
|
|
411 next unless $keep{$QTL};
|
|
|
412 if ($keep{$QTL} ne ".")
|
|
|
413 {
|
|
|
414 $score+=$score_QTL;
|
|
|
415 $iQTL+=$score_QTL; #if ($keep{$QTL} ne ".");
|
|
|
416 }
|
|
|
417
|
|
|
418 }
|
|
|
419 $summary_line.="$iQTL\t";
|
|
|
420 if ($nind>=$arguments{"nind"} && $AF<=0.01)
|
|
|
421 {
|
|
|
422 $score+=$score_nIND;
|
|
|
423 $summary_line.="$score_nIND\t";
|
|
|
424 }elsif($nind>=$arguments{"nind"}/2 && $nind<$arguments{"nind"} && $AF<=0.01){
|
|
|
425 $score+=$score_nIND/2;
|
|
|
426 $summary_line.=$score_nIND/2 ."\t";
|
|
|
427 }else{
|
|
|
428 $summary_line.="0\t";
|
|
|
429 }
|
|
|
430 chomp();
|
|
|
431 chop($_);
|
|
|
432 $outL="";
|
|
|
433 $IS_MAL=0;
|
|
|
434 foreach $k (sort keys %specialKeys)
|
|
|
435 {
|
|
|
436 unless($keep{$k})
|
|
|
437 {
|
|
|
438 #warn("Malformed $_\n");
|
|
|
439 warn("please check $k is missing\n");
|
|
|
440 $IS_MAL=1;
|
|
|
441 }
|
|
|
442 $outL.="$keep{$k}\t";
|
|
|
443 }
|
|
|
444 next if $IS_MAL==1;
|
|
|
445 $score=$score/2 if $arguments{"AD"} eq "F" && $Nhom==0;
|
|
|
446 $score=$score/2 if $arguments{"XL"} eq "T" && $chr ne "chrX";
|
|
|
447 $summary_line.="$score\n";
|
|
|
448 print O "$chr\t$start\t$gene\t$b1\t$b2\t$nind\t$outL$score\n"; #if $score>=12 || $DIS==1;
|
|
|
449 print OV "$chr\t$start\t$pstart\t$b1\t$b2\t$qscore\t$pb2\t$annot;VINYL_score=$score\t$gt\t$samples\n";
|
|
|
450 print OS "$summary_line";
|
|
|
451 }
|