comparison score_complete_alt_M.pl @ 0:4c6529d120c3 draft

Uploaded
author elixir-it
date Tue, 09 Jun 2020 16:07:19 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4c6529d120c3
1 #!/usr/bin/perl -w
2 #use strict;
3
4 %arguments=
5 (
6 "AD"=>"T", #value mandatory, T==TRUE F==FALSE
7 "XL"=>"F", #value mandatory, F==FALSE T==TRUE
8 "vcf"=>"", #file mandatory, provided at runtime
9 "disease"=>"", #name optional
10 "similarD"=>"", #file optional
11 "lgenes"=>"", #file optional
12 "leQTL"=>"", #file optional
13 "keywords"=>"kfile", #file mandatory, but default value
14 "effects"=>"efile", #file mandatory, but default value
15 "disease_clinvar"=>8, #numeric mandadory, but default value
16 "score_AF"=>4, #numeric mandatory, but default value
17 "score_functional"=>8, #numeric mandatory, but default value
18 "score_NS"=>6, #numeric mandatory, but default value
19 "score_nIND"=>8, #numeric mandatory, but default value
20 "AF"=>0.0001, #numeric mandatory, but default value
21 "scoreeQTL"=>1, #numeric mandatory, but default value
22 "nind"=>5, #numeric mandatory, but default value
23 "scoreG"=>2, #numeric mandatory, but default value
24 "ifile"=>"", #file optional
25 "scoreT"=>1, #numeric mandatory, but default value
26 "scoreGW"=>1, #numeric mandatory but default value
27 "scoreM"=>1, #numeric mandatory but default value
28 "scoreR"=>1, #numeric mandatory but default value
29 "scoreSP"=>1, #numeric mandatory but default value
30 #####OUTPUT file#############################################
31 "ofile"=>"final_res.csv", #file #OUTPUT #tabulare
32 "ovcfile"=>"final_res.vcf",#, #file #OUTOUT #vcf
33 "osummary"=>"detailed_final_res.csv"
34 );
35
36 @arguments=@ARGV;
37 for ($i=0;$i<=$#ARGV;$i+=2)
38 {
39 $act=$ARGV[$i];
40 $act=~s/-//g;
41 $val=$ARGV[$i+1];
42 if (exists $arguments{$act})
43 {
44 $arguments{$act}=$val;
45 }else{
46 warn("$act: unknown argument\n");
47 @valid=keys %arguments;
48 warn("Valid arguments are @valid\n");
49 die("All those moments will be lost in time, like tears in rain.\n Time to die!\n");
50 }
51 }
52
53 $ofile_name=$arguments{"ofile"};
54 open(O,">$ofile_name");
55 $ovcfile=$arguments{"ovcfile"};
56 open(OV,">$ovcfile");
57 $osummary_name=$arguments{"osummary"};
58 open(OS,">$osummary_name");
59 print OS "chr\tstart\tref\talt\tNhom\tNhet\tNind\tGene\tScoreG\tScoreCV\tScoreOth\tScoreAF\tScoreEff\tScoreSP\tScoreTF\tScoremir\tScoreREG\tScoreGWAS\tScoreNS\tScoreQTL\tScoreNi\tScoreT\n";
60
61 $ifile=$arguments{"ifile"};
62 if (-e $ifile)
63 {
64 open(IN,$ifile);
65 while(<IN>)
66 {
67 #($G1,$G2)=(split());
68 #push (@{$interact{$G1}},$G2);
69 }
70 }
71
72
73 $file=$arguments{"vcf"} ;
74 die ("input file $file not found!\n") unless -e $file;
75
76 $lgenes=$arguments{"lgenes"};
77 if (-e $lgenes)
78 {
79 open(IN,$lgenes);
80 while(<IN>)
81 {
82 chomp;
83 $G=(split())[0];
84 $Lgenes{$G}=1;
85 }
86 }
87
88
89 $kfile=$arguments{"keywords"};
90 die ("keyword file $kfile not found!\n") unless -e $kfile;
91 open(IN,$kfile);
92 while(<IN>)
93 {
94 chomp;
95 ($k,$category)=(split())[0,1];
96 $specialKeys{$k}=$category;
97 push (@{$annotKeys{$category}},$k);
98 }
99
100 $diseaseO=$arguments{"disease"} ? $arguments{"disease"} : "GinocchioValgoDellaLavandaiaZoppa";
101 @diseaseO=split(/#/,$diseaseO);
102
103 $sfile=$arguments{"similarD"};
104 if (-e $sfile)
105 {
106 open(IN,$sfile);
107 while(<IN>)
108 {
109 chomp;
110 $Sw=lc($_);
111 $Sw=~s/\s+//;
112 push(@kw,$Sw);
113 }
114 }
115
116 $efile=$arguments{"effects"};
117 die ("effect file $efile not found!\n") unless -e $efile;
118 open(IN,$efile);
119 while(<IN>)
120 {
121 chomp;
122 $effects{$_}=1;
123 }
124
125 $leQTLfile=$arguments{"leQTL"};
126 if (-e $leQTLfile)
127 {
128 open(IN,$leQTLfile);
129 while(<IN>)
130 {
131 chomp();
132 $Qlist{$_}=1;
133 }
134 }
135
136 $disease_clinvar=$arguments{"disease_clinvar"};
137 $score_AF=$arguments{"score_AF"};
138 $score_functional=$arguments{"score_functional"};
139 $score_NS=$arguments{"score_NS"};
140 $score_nIND=$arguments{"score_nIND"};
141 $score_QTL=$arguments{"scoreeQTL"};
142 $scoreG=$arguments{"scoreG"};
143 $scoreM=$arguments{"scoreM"};
144 $scoreT=$arguments{"scoreT"};
145 $scoreR=$arguments{"scoreR"};
146 $scoreGW=$arguments{"scoreGW"};
147 $scoreSP=$arguments{"scoreSP"};
148
149 #print O "CHR\tstart\tgene\tref\talt\tAC\tNhom\tNhet\tNind\tGene\tScoreCV\tScoreOth\tScoreAF\tScoreEff\tScoreNS\tScoreQTL\tScoreNi\tScoreT\n";
150 print O "CHR\tstart\tgene\tref\talt\tAC\t";
151
152 foreach $k (sort keys %specialKeys)
153 {
154 print O "$k\t";
155 }
156 print O "VINYL_score\n";
157
158
159 open(IN,$file);
160
161 #print "$gene_score $disease_HGMD $disease_clinvar $score_functional $score_AF $score_NS $score_eQTL $score_nIND\n";
162 while(<IN>)
163 {
164 if ($_=~/^#/)
165 {
166 print OV;
167 next;
168 }
169 chomp();
170 $summary_line="";
171 @val=(split(/\t/));
172 $chr=$val[0];
173 $start=$val[1];
174 $pstart=$val[2];
175 $b1=$val[3];
176 $b2=$val[4];
177 $qscore=$val[5];
178 $pb2=$val[6];
179 $gene="na";
180 $annot=$val[7];
181 $gt=$val[8];
182 @samples=@val[9..$#val];
183 $Nhom=0;
184 $Nhet=0;
185 $summary_line.="$chr\t$start\t$b1\t$b2\t";
186 foreach $s (@samples)
187 {
188 $sid=(split(/\:/,$s))[0];
189 next if $sid eq ".";
190 if ($sid=~/\|/)
191 {
192 ($h1,$h2)=(split(/\|/,$sid))[0,1];
193 }elsif($sid=~/\//){
194 ($h1,$h2)=(split(/\//,$sid))[0,1];
195 }
196 $Nhom++ if $h1==$h2 && $h1 !=0;
197 $Nhet++ if $h1!=$h2 && ($h1!=0 && $h2!=0) ;
198 }
199 $summary_line.="$Nhom\t$Nhet\t";
200 $samples=(join("\t",@samples));
201 @terms=(split(/\;/,$annot));
202 $DIS=0;
203 if ($_=~/;AC=(\d+);/ || $_=~/\tAC=(\d+);/)
204 {
205 $nind=$1;
206 }
207 if ($_=~/Gene.refGene=(\w+);/)
208 {
209 $gene=$1;
210 }
211 $summary_line.="$nind\t$gene\t";
212 next if $nind==0;
213 $gene=(split(/\,/,$gene))[0];
214 $i=0;
215 %keep=();
216 $score=0;
217 %riserva=(7569521=>1, 32974391=>1, 228557681=>1, 228527758=>1, 228525823=>1, 156106964=>1 );
218 $score+=6 if $riserva{$start};
219 $G=0;
220 if ($Lgenes{$gene})
221 {
222 $score+=$scoreG;
223 $G=$scoreG;
224 }
225 $summary_line.="$G\t";
226 foreach $t (@terms)
227 {
228 next unless $t;
229 ($keep,$value)=(split(/\=/,$t))[0,1];
230 $value="." unless ($value);
231 $keep{$keep}=$value;
232 }
233 if ($keep{"CLNSIG"} ne "."){
234 $scoreO=0;
235 $scoreC=0;
236 $add=0;
237 $add=$disease_clinvar if ($keep{"CLNSIG"} eq "Pathogenic" || $keep{"CLNSIG"} eq "Pathogenic,_other,_risk_factor" || $keep{"CLNSIG"} eq "pathogenic" || $keep{"CLNSIG"} eq "Pathogenic/Likely_pathogenic" );
238 $add=$disease_clinvar/2 if ($keep{"CLNSIG"} eq "Likely_pathogenic" || $keep{"CLNSIG"} eq "Conflicting_interpretations_of_pathogenicity" || $keep{"CLNSIG"} eq "likely-pathogenic");
239 $add-=$disease_clinvar/4 if ($keep{"CLNSIG"} eq "Likely_benign" || $keep{"CLNSIG"} eq "Benign/Likely_benign");
240 $add-=$disease_clinvar/2 if ($keep{"CLNSIG"} eq "Benign");
241 @diseases=split(/\|/,$keep{"CLNDN"});
242 @databases=split(/\|/,$keep{"CLNDISDB"});
243 for ($i=0;$i<=$#diseases;$i++)
244 {
245 $dis=lc $diseases[$i];
246 $dat=$databases[$i];
247 $MDO=0;
248 foreach $disOL (@diseaseO)
249 {
250 $disOL=lc $disOL;
251 if ($dis=~ /$disOL/)
252 {
253 if ($dat=~/OMIM/)
254 {
255 $scoreO=$add;
256 }else{
257 $scoreC=$add;
258 }
259 $MDO=1;
260 last;
261 }
262 }
263 if ($MDO==0)
264 {
265 foreach $kv (@kw)
266 {
267 if ($dis=~/$kv/)
268 {
269 $DIS=1 if $add>0;
270 if ($dat=~/OMIM/)
271 {
272 $scoreO=$add;
273 }else{
274 $scoreC=$add;
275 }
276 }
277 }
278 }
279 }
280 $score+=$scoreO+$scoreC;
281 $summary_line.="$scoreO\t$scoreC\t";
282 #duplicate this block for KW annotated
283 }else{
284 $summary_line.="0\t0\t";
285 }
286 @AFkeys=@{$annotKeys{"AF"}};
287 $AF=0;
288 foreach $AFK (@AFkeys)
289 {
290 $LOC_af=$keep{$AFK} ? $keep{$AFK} : 0 ;
291 $LOC_af=0 if $LOC_af eq ".";
292 $AF=$LOC_af if $LOC_af>$AF;
293 }
294 $AF=$AF/20 if ($chr eq "chr1" && ($start==228557681 || $start==228527758 || $start==228525823));
295 if ($AF<=$arguments{"AF"}) #0,00002
296 {
297 $score+=$score_AF;
298 $summary_line.="$score_AF\t";
299 }elsif($AF>$arguments{"AF"} && $AF<=$arguments{"AF"}*4){
300 $score+=$score_AF/2;
301 $summary_line.=$score_AF/2 ."\t";
302 }elsif($AF>$arguments{"AF"}*4 && $AF<=0.01){
303 $summary_line.="0\t";
304 }elsif($AF>0.01){ #commonSNP
305 $score-=$score_AF/2;
306 $summary_line.=-$score_AF/2 ."\t";
307 }
308
309 @EFFkeys=@{$annotKeys{"Effect"}};
310 $EFFS=0;
311 foreach $EFF (@EFFkeys)
312 {
313 $effectO=(split(/\;/,$keep{$EFF}))[0];
314 $score+=$score_functional if $effects{$effectO};
315 $EFFS+=$score_functional if $effects{$effectO};
316 }
317 $summary_line.="$EFFS\t";
318
319 @SPKeys=@{$annotKeys{"Splice"}};
320 $SPs=0;
321 foreach $SPk (@SPKeys)
322 {
323 next if ($keep{$SPk} eq ".");
324 if($keep{$SPk}>0.6)
325 {
326 $score+=$scoreSP/($#SPKeys+1);
327 $SPs+=$scoreSP/($#SPKeys+1);
328 }
329 }
330 $summary_line.="$SPs\t";
331
332 $TFscore=0;
333 @TFBSkeys=@{$annotKeys{"tfbs"}};
334 foreach $T (@TFBSkeys)
335 {
336 $score+=$scoreT if $keep{$T} ne ".";
337 $TFscore+=$scoreT if $keep{$T} ne ".";
338 }
339 $summary_line.="$TFscore\t";
340
341 $mirscore=0;
342 @mirkeys=@{$annotKeys{"mirna"}};
343 foreach $M (@mirkeys)
344 {
345 $score+=$scoreM if $keep{$M} ne ".";
346 $mirscore+=$scoreM if $keep{$M} ne ".";
347 }
348 $summary_line.="$mirscore\t";
349
350 $REGscore=0;
351 @REGkeys=@{$annotKeys{"Reg"}};
352 foreach $R (@REGkeys)
353 {
354 $score+=$scoreR if $keep{$R} ne ".";
355 $REGscore+=$scoreR if $keep{$R} ne ".";
356 }
357 $summary_line.="$REGscore\t";
358
359 $GWscore=0;
360 if ($keep{"GWAS"} ne ".")
361 {
362 $GW=$keep{"GWAS"};
363 $GW=lc($GW);
364 @Gs=(split(/\|/,$GW));
365 foreach $Gword (@Gs)
366 {
367 next if $Gword eq " ";
368 $Gword=~s/-/ /g;
369 foreach $kv (@kw)
370 {
371 if ($Gword=~/$kv/)
372 {
373 $score+=$scoreGW;
374 $GWscore+=$scoreGW;
375 last;
376 }
377 }
378 }
379
380 }
381 $summary_line.="$GWscore\t";
382
383 $NSa=0;
384 #print "Exon " . $keep{"ExonicFunc.refGene"}. " $score\n"; #modify here to use kwords in conf file
385 if ($keep{"ExonicFunc.refGene"} eq "nonsynonymous_SNV")
386 {
387 $ND=0;
388 @NStools=@{$annotKeys{"NStool"}};
389 foreach $t (@NStools)
390 {
391 $NSscore=$keep{$t};
392 next if $NSscore eq ".";
393 $NSscore="D" if $t=~/CADD/ && $NSscore>=5;
394 $ND++ if $NSscore eq "D";
395 }
396
397 if ($ND==$#NStools+1)
398 {
399 $score+=$score_NS;
400 $NSa+=$score_NS;
401 }elsif($ND>=($#NStools+1)/2){
402 $score+=$score_NS/(($#NStools+1)/2);
403 $NSa+=$score_NS/(($#NStools+1)/2);
404 }elsif($ND==0){
405 $score+=$score_NS/($#NStools+1); #da commentare. o NS/4
406 $NSa+=$score_NS/($#NStools+1);
407 }
408 }
409 $summary_line.="$NSa\t";
410 $iQTL=0;
411 foreach $QTL (keys %Qlist)
412 {
413 next unless $keep{$QTL};
414 if ($keep{$QTL} ne ".")
415 {
416 $score+=$score_QTL;
417 $iQTL+=$score_QTL; #if ($keep{$QTL} ne ".");
418 }
419
420 }
421 $summary_line.="$iQTL\t";
422 if ($nind>=$arguments{"nind"} && $AF<=0.01)
423 {
424 $score+=$score_nIND;
425 $summary_line.="$score_nIND\t";
426 }elsif($nind>=$arguments{"nind"}/2 && $nind<$arguments{"nind"} && $AF<=0.01){
427 $score+=$score_nIND/2;
428 $summary_line.=$score_nIND/2 ."\t";
429 }else{
430 $summary_line.="0\t";
431 }
432 chomp();
433 chop($_);
434 $outL="";
435 $IS_MAL=0;
436 foreach $k (sort keys %specialKeys)
437 {
438 unless($keep{$k})
439 {
440 #warn("Malformed $_\n");
441 warn("please check $k is missing\n");
442 $IS_MAL=1;
443 }
444 $outL.="$keep{$k}\t";
445 }
446 next if $IS_MAL==1;
447 $score=$score/2 if $arguments{"AD"} eq "F" && $Nhom==0;
448 $score=$score/2 if $arguments{"XL"} eq "T" && $chr ne "chrX";
449 $summary_line.="$score\n";
450 print O "$chr\t$start\t$gene\t$b1\t$b2\t$nind\t$outL$score\n"; #if $score>=12 || $DIS==1;
451 print OV "$chr\t$start\t$pstart\t$b1\t$b2\t$qscore\t$pb2\t$annot;VINYL_score=$score\t$gt\t$samples\n";
452 print OS "$summary_line";
453 }