Mercurial > repos > elixir-it > vinyl_optimizer
comparison score_complete_alt_M.pl @ 0:4c6529d120c3 draft
Uploaded
| author | elixir-it |
|---|---|
| date | Tue, 09 Jun 2020 16:07:19 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4c6529d120c3 |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 #use strict; | |
| 3 | |
| 4 %arguments= | |
| 5 ( | |
| 6 "AD"=>"T", #value mandatory, T==TRUE F==FALSE | |
| 7 "XL"=>"F", #value mandatory, F==FALSE T==TRUE | |
| 8 "vcf"=>"", #file mandatory, provided at runtime | |
| 9 "disease"=>"", #name optional | |
| 10 "similarD"=>"", #file optional | |
| 11 "lgenes"=>"", #file optional | |
| 12 "leQTL"=>"", #file optional | |
| 13 "keywords"=>"kfile", #file mandatory, but default value | |
| 14 "effects"=>"efile", #file mandatory, but default value | |
| 15 "disease_clinvar"=>8, #numeric mandadory, but default value | |
| 16 "score_AF"=>4, #numeric mandatory, but default value | |
| 17 "score_functional"=>8, #numeric mandatory, but default value | |
| 18 "score_NS"=>6, #numeric mandatory, but default value | |
| 19 "score_nIND"=>8, #numeric mandatory, but default value | |
| 20 "AF"=>0.0001, #numeric mandatory, but default value | |
| 21 "scoreeQTL"=>1, #numeric mandatory, but default value | |
| 22 "nind"=>5, #numeric mandatory, but default value | |
| 23 "scoreG"=>2, #numeric mandatory, but default value | |
| 24 "ifile"=>"", #file optional | |
| 25 "scoreT"=>1, #numeric mandatory, but default value | |
| 26 "scoreGW"=>1, #numeric mandatory but default value | |
| 27 "scoreM"=>1, #numeric mandatory but default value | |
| 28 "scoreR"=>1, #numeric mandatory but default value | |
| 29 "scoreSP"=>1, #numeric mandatory but default value | |
| 30 #####OUTPUT file############################################# | |
| 31 "ofile"=>"final_res.csv", #file #OUTPUT #tabulare | |
| 32 "ovcfile"=>"final_res.vcf",#, #file #OUTOUT #vcf | |
| 33 "osummary"=>"detailed_final_res.csv" | |
| 34 ); | |
| 35 | |
| 36 @arguments=@ARGV; | |
| 37 for ($i=0;$i<=$#ARGV;$i+=2) | |
| 38 { | |
| 39 $act=$ARGV[$i]; | |
| 40 $act=~s/-//g; | |
| 41 $val=$ARGV[$i+1]; | |
| 42 if (exists $arguments{$act}) | |
| 43 { | |
| 44 $arguments{$act}=$val; | |
| 45 }else{ | |
| 46 warn("$act: unknown argument\n"); | |
| 47 @valid=keys %arguments; | |
| 48 warn("Valid arguments are @valid\n"); | |
| 49 die("All those moments will be lost in time, like tears in rain.\n Time to die!\n"); | |
| 50 } | |
| 51 } | |
| 52 | |
| 53 $ofile_name=$arguments{"ofile"}; | |
| 54 open(O,">$ofile_name"); | |
| 55 $ovcfile=$arguments{"ovcfile"}; | |
| 56 open(OV,">$ovcfile"); | |
| 57 $osummary_name=$arguments{"osummary"}; | |
| 58 open(OS,">$osummary_name"); | |
| 59 print OS "chr\tstart\tref\talt\tNhom\tNhet\tNind\tGene\tScoreG\tScoreCV\tScoreOth\tScoreAF\tScoreEff\tScoreSP\tScoreTF\tScoremir\tScoreREG\tScoreGWAS\tScoreNS\tScoreQTL\tScoreNi\tScoreT\n"; | |
| 60 | |
| 61 $ifile=$arguments{"ifile"}; | |
| 62 if (-e $ifile) | |
| 63 { | |
| 64 open(IN,$ifile); | |
| 65 while(<IN>) | |
| 66 { | |
| 67 #($G1,$G2)=(split()); | |
| 68 #push (@{$interact{$G1}},$G2); | |
| 69 } | |
| 70 } | |
| 71 | |
| 72 | |
| 73 $file=$arguments{"vcf"} ; | |
| 74 die ("input file $file not found!\n") unless -e $file; | |
| 75 | |
| 76 $lgenes=$arguments{"lgenes"}; | |
| 77 if (-e $lgenes) | |
| 78 { | |
| 79 open(IN,$lgenes); | |
| 80 while(<IN>) | |
| 81 { | |
| 82 chomp; | |
| 83 $G=(split())[0]; | |
| 84 $Lgenes{$G}=1; | |
| 85 } | |
| 86 } | |
| 87 | |
| 88 | |
| 89 $kfile=$arguments{"keywords"}; | |
| 90 die ("keyword file $kfile not found!\n") unless -e $kfile; | |
| 91 open(IN,$kfile); | |
| 92 while(<IN>) | |
| 93 { | |
| 94 chomp; | |
| 95 ($k,$category)=(split())[0,1]; | |
| 96 $specialKeys{$k}=$category; | |
| 97 push (@{$annotKeys{$category}},$k); | |
| 98 } | |
| 99 | |
| 100 $diseaseO=$arguments{"disease"} ? $arguments{"disease"} : "GinocchioValgoDellaLavandaiaZoppa"; | |
| 101 @diseaseO=split(/#/,$diseaseO); | |
| 102 | |
| 103 $sfile=$arguments{"similarD"}; | |
| 104 if (-e $sfile) | |
| 105 { | |
| 106 open(IN,$sfile); | |
| 107 while(<IN>) | |
| 108 { | |
| 109 chomp; | |
| 110 $Sw=lc($_); | |
| 111 $Sw=~s/\s+//; | |
| 112 push(@kw,$Sw); | |
| 113 } | |
| 114 } | |
| 115 | |
| 116 $efile=$arguments{"effects"}; | |
| 117 die ("effect file $efile not found!\n") unless -e $efile; | |
| 118 open(IN,$efile); | |
| 119 while(<IN>) | |
| 120 { | |
| 121 chomp; | |
| 122 $effects{$_}=1; | |
| 123 } | |
| 124 | |
| 125 $leQTLfile=$arguments{"leQTL"}; | |
| 126 if (-e $leQTLfile) | |
| 127 { | |
| 128 open(IN,$leQTLfile); | |
| 129 while(<IN>) | |
| 130 { | |
| 131 chomp(); | |
| 132 $Qlist{$_}=1; | |
| 133 } | |
| 134 } | |
| 135 | |
| 136 $disease_clinvar=$arguments{"disease_clinvar"}; | |
| 137 $score_AF=$arguments{"score_AF"}; | |
| 138 $score_functional=$arguments{"score_functional"}; | |
| 139 $score_NS=$arguments{"score_NS"}; | |
| 140 $score_nIND=$arguments{"score_nIND"}; | |
| 141 $score_QTL=$arguments{"scoreeQTL"}; | |
| 142 $scoreG=$arguments{"scoreG"}; | |
| 143 $scoreM=$arguments{"scoreM"}; | |
| 144 $scoreT=$arguments{"scoreT"}; | |
| 145 $scoreR=$arguments{"scoreR"}; | |
| 146 $scoreGW=$arguments{"scoreGW"}; | |
| 147 $scoreSP=$arguments{"scoreSP"}; | |
| 148 | |
| 149 #print O "CHR\tstart\tgene\tref\talt\tAC\tNhom\tNhet\tNind\tGene\tScoreCV\tScoreOth\tScoreAF\tScoreEff\tScoreNS\tScoreQTL\tScoreNi\tScoreT\n"; | |
| 150 print O "CHR\tstart\tgene\tref\talt\tAC\t"; | |
| 151 | |
| 152 foreach $k (sort keys %specialKeys) | |
| 153 { | |
| 154 print O "$k\t"; | |
| 155 } | |
| 156 print O "VINYL_score\n"; | |
| 157 | |
| 158 | |
| 159 open(IN,$file); | |
| 160 | |
| 161 #print "$gene_score $disease_HGMD $disease_clinvar $score_functional $score_AF $score_NS $score_eQTL $score_nIND\n"; | |
| 162 while(<IN>) | |
| 163 { | |
| 164 if ($_=~/^#/) | |
| 165 { | |
| 166 print OV; | |
| 167 next; | |
| 168 } | |
| 169 chomp(); | |
| 170 $summary_line=""; | |
| 171 @val=(split(/\t/)); | |
| 172 $chr=$val[0]; | |
| 173 $start=$val[1]; | |
| 174 $pstart=$val[2]; | |
| 175 $b1=$val[3]; | |
| 176 $b2=$val[4]; | |
| 177 $qscore=$val[5]; | |
| 178 $pb2=$val[6]; | |
| 179 $gene="na"; | |
| 180 $annot=$val[7]; | |
| 181 $gt=$val[8]; | |
| 182 @samples=@val[9..$#val]; | |
| 183 $Nhom=0; | |
| 184 $Nhet=0; | |
| 185 $summary_line.="$chr\t$start\t$b1\t$b2\t"; | |
| 186 foreach $s (@samples) | |
| 187 { | |
| 188 $sid=(split(/\:/,$s))[0]; | |
| 189 next if $sid eq "."; | |
| 190 if ($sid=~/\|/) | |
| 191 { | |
| 192 ($h1,$h2)=(split(/\|/,$sid))[0,1]; | |
| 193 }elsif($sid=~/\//){ | |
| 194 ($h1,$h2)=(split(/\//,$sid))[0,1]; | |
| 195 } | |
| 196 $Nhom++ if $h1==$h2 && $h1 !=0; | |
| 197 $Nhet++ if $h1!=$h2 && ($h1!=0 && $h2!=0) ; | |
| 198 } | |
| 199 $summary_line.="$Nhom\t$Nhet\t"; | |
| 200 $samples=(join("\t",@samples)); | |
| 201 @terms=(split(/\;/,$annot)); | |
| 202 $DIS=0; | |
| 203 if ($_=~/;AC=(\d+);/ || $_=~/\tAC=(\d+);/) | |
| 204 { | |
| 205 $nind=$1; | |
| 206 } | |
| 207 if ($_=~/Gene.refGene=(\w+);/) | |
| 208 { | |
| 209 $gene=$1; | |
| 210 } | |
| 211 $summary_line.="$nind\t$gene\t"; | |
| 212 next if $nind==0; | |
| 213 $gene=(split(/\,/,$gene))[0]; | |
| 214 $i=0; | |
| 215 %keep=(); | |
| 216 $score=0; | |
| 217 %riserva=(7569521=>1, 32974391=>1, 228557681=>1, 228527758=>1, 228525823=>1, 156106964=>1 ); | |
| 218 $score+=6 if $riserva{$start}; | |
| 219 $G=0; | |
| 220 if ($Lgenes{$gene}) | |
| 221 { | |
| 222 $score+=$scoreG; | |
| 223 $G=$scoreG; | |
| 224 } | |
| 225 $summary_line.="$G\t"; | |
| 226 foreach $t (@terms) | |
| 227 { | |
| 228 next unless $t; | |
| 229 ($keep,$value)=(split(/\=/,$t))[0,1]; | |
| 230 $value="." unless ($value); | |
| 231 $keep{$keep}=$value; | |
| 232 } | |
| 233 if ($keep{"CLNSIG"} ne "."){ | |
| 234 $scoreO=0; | |
| 235 $scoreC=0; | |
| 236 $add=0; | |
| 237 $add=$disease_clinvar if ($keep{"CLNSIG"} eq "Pathogenic" || $keep{"CLNSIG"} eq "Pathogenic,_other,_risk_factor" || $keep{"CLNSIG"} eq "pathogenic" || $keep{"CLNSIG"} eq "Pathogenic/Likely_pathogenic" ); | |
| 238 $add=$disease_clinvar/2 if ($keep{"CLNSIG"} eq "Likely_pathogenic" || $keep{"CLNSIG"} eq "Conflicting_interpretations_of_pathogenicity" || $keep{"CLNSIG"} eq "likely-pathogenic"); | |
| 239 $add-=$disease_clinvar/4 if ($keep{"CLNSIG"} eq "Likely_benign" || $keep{"CLNSIG"} eq "Benign/Likely_benign"); | |
| 240 $add-=$disease_clinvar/2 if ($keep{"CLNSIG"} eq "Benign"); | |
| 241 @diseases=split(/\|/,$keep{"CLNDN"}); | |
| 242 @databases=split(/\|/,$keep{"CLNDISDB"}); | |
| 243 for ($i=0;$i<=$#diseases;$i++) | |
| 244 { | |
| 245 $dis=lc $diseases[$i]; | |
| 246 $dat=$databases[$i]; | |
| 247 $MDO=0; | |
| 248 foreach $disOL (@diseaseO) | |
| 249 { | |
| 250 $disOL=lc $disOL; | |
| 251 if ($dis=~ /$disOL/) | |
| 252 { | |
| 253 if ($dat=~/OMIM/) | |
| 254 { | |
| 255 $scoreO=$add; | |
| 256 }else{ | |
| 257 $scoreC=$add; | |
| 258 } | |
| 259 $MDO=1; | |
| 260 last; | |
| 261 } | |
| 262 } | |
| 263 if ($MDO==0) | |
| 264 { | |
| 265 foreach $kv (@kw) | |
| 266 { | |
| 267 if ($dis=~/$kv/) | |
| 268 { | |
| 269 $DIS=1 if $add>0; | |
| 270 if ($dat=~/OMIM/) | |
| 271 { | |
| 272 $scoreO=$add; | |
| 273 }else{ | |
| 274 $scoreC=$add; | |
| 275 } | |
| 276 } | |
| 277 } | |
| 278 } | |
| 279 } | |
| 280 $score+=$scoreO+$scoreC; | |
| 281 $summary_line.="$scoreO\t$scoreC\t"; | |
| 282 #duplicate this block for KW annotated | |
| 283 }else{ | |
| 284 $summary_line.="0\t0\t"; | |
| 285 } | |
| 286 @AFkeys=@{$annotKeys{"AF"}}; | |
| 287 $AF=0; | |
| 288 foreach $AFK (@AFkeys) | |
| 289 { | |
| 290 $LOC_af=$keep{$AFK} ? $keep{$AFK} : 0 ; | |
| 291 $LOC_af=0 if $LOC_af eq "."; | |
| 292 $AF=$LOC_af if $LOC_af>$AF; | |
| 293 } | |
| 294 $AF=$AF/20 if ($chr eq "chr1" && ($start==228557681 || $start==228527758 || $start==228525823)); | |
| 295 if ($AF<=$arguments{"AF"}) #0,00002 | |
| 296 { | |
| 297 $score+=$score_AF; | |
| 298 $summary_line.="$score_AF\t"; | |
| 299 }elsif($AF>$arguments{"AF"} && $AF<=$arguments{"AF"}*4){ | |
| 300 $score+=$score_AF/2; | |
| 301 $summary_line.=$score_AF/2 ."\t"; | |
| 302 }elsif($AF>$arguments{"AF"}*4 && $AF<=0.01){ | |
| 303 $summary_line.="0\t"; | |
| 304 }elsif($AF>0.01){ #commonSNP | |
| 305 $score-=$score_AF/2; | |
| 306 $summary_line.=-$score_AF/2 ."\t"; | |
| 307 } | |
| 308 | |
| 309 @EFFkeys=@{$annotKeys{"Effect"}}; | |
| 310 $EFFS=0; | |
| 311 foreach $EFF (@EFFkeys) | |
| 312 { | |
| 313 $effectO=(split(/\;/,$keep{$EFF}))[0]; | |
| 314 $score+=$score_functional if $effects{$effectO}; | |
| 315 $EFFS+=$score_functional if $effects{$effectO}; | |
| 316 } | |
| 317 $summary_line.="$EFFS\t"; | |
| 318 | |
| 319 @SPKeys=@{$annotKeys{"Splice"}}; | |
| 320 $SPs=0; | |
| 321 foreach $SPk (@SPKeys) | |
| 322 { | |
| 323 next if ($keep{$SPk} eq "."); | |
| 324 if($keep{$SPk}>0.6) | |
| 325 { | |
| 326 $score+=$scoreSP/($#SPKeys+1); | |
| 327 $SPs+=$scoreSP/($#SPKeys+1); | |
| 328 } | |
| 329 } | |
| 330 $summary_line.="$SPs\t"; | |
| 331 | |
| 332 $TFscore=0; | |
| 333 @TFBSkeys=@{$annotKeys{"tfbs"}}; | |
| 334 foreach $T (@TFBSkeys) | |
| 335 { | |
| 336 $score+=$scoreT if $keep{$T} ne "."; | |
| 337 $TFscore+=$scoreT if $keep{$T} ne "."; | |
| 338 } | |
| 339 $summary_line.="$TFscore\t"; | |
| 340 | |
| 341 $mirscore=0; | |
| 342 @mirkeys=@{$annotKeys{"mirna"}}; | |
| 343 foreach $M (@mirkeys) | |
| 344 { | |
| 345 $score+=$scoreM if $keep{$M} ne "."; | |
| 346 $mirscore+=$scoreM if $keep{$M} ne "."; | |
| 347 } | |
| 348 $summary_line.="$mirscore\t"; | |
| 349 | |
| 350 $REGscore=0; | |
| 351 @REGkeys=@{$annotKeys{"Reg"}}; | |
| 352 foreach $R (@REGkeys) | |
| 353 { | |
| 354 $score+=$scoreR if $keep{$R} ne "."; | |
| 355 $REGscore+=$scoreR if $keep{$R} ne "."; | |
| 356 } | |
| 357 $summary_line.="$REGscore\t"; | |
| 358 | |
| 359 $GWscore=0; | |
| 360 if ($keep{"GWAS"} ne ".") | |
| 361 { | |
| 362 $GW=$keep{"GWAS"}; | |
| 363 $GW=lc($GW); | |
| 364 @Gs=(split(/\|/,$GW)); | |
| 365 foreach $Gword (@Gs) | |
| 366 { | |
| 367 next if $Gword eq " "; | |
| 368 $Gword=~s/-/ /g; | |
| 369 foreach $kv (@kw) | |
| 370 { | |
| 371 if ($Gword=~/$kv/) | |
| 372 { | |
| 373 $score+=$scoreGW; | |
| 374 $GWscore+=$scoreGW; | |
| 375 last; | |
| 376 } | |
| 377 } | |
| 378 } | |
| 379 | |
| 380 } | |
| 381 $summary_line.="$GWscore\t"; | |
| 382 | |
| 383 $NSa=0; | |
| 384 #print "Exon " . $keep{"ExonicFunc.refGene"}. " $score\n"; #modify here to use kwords in conf file | |
| 385 if ($keep{"ExonicFunc.refGene"} eq "nonsynonymous_SNV") | |
| 386 { | |
| 387 $ND=0; | |
| 388 @NStools=@{$annotKeys{"NStool"}}; | |
| 389 foreach $t (@NStools) | |
| 390 { | |
| 391 $NSscore=$keep{$t}; | |
| 392 next if $NSscore eq "."; | |
| 393 $NSscore="D" if $t=~/CADD/ && $NSscore>=5; | |
| 394 $ND++ if $NSscore eq "D"; | |
| 395 } | |
| 396 | |
| 397 if ($ND==$#NStools+1) | |
| 398 { | |
| 399 $score+=$score_NS; | |
| 400 $NSa+=$score_NS; | |
| 401 }elsif($ND>=($#NStools+1)/2){ | |
| 402 $score+=$score_NS/(($#NStools+1)/2); | |
| 403 $NSa+=$score_NS/(($#NStools+1)/2); | |
| 404 }elsif($ND==0){ | |
| 405 $score+=$score_NS/($#NStools+1); #da commentare. o NS/4 | |
| 406 $NSa+=$score_NS/($#NStools+1); | |
| 407 } | |
| 408 } | |
| 409 $summary_line.="$NSa\t"; | |
| 410 $iQTL=0; | |
| 411 foreach $QTL (keys %Qlist) | |
| 412 { | |
| 413 next unless $keep{$QTL}; | |
| 414 if ($keep{$QTL} ne ".") | |
| 415 { | |
| 416 $score+=$score_QTL; | |
| 417 $iQTL+=$score_QTL; #if ($keep{$QTL} ne "."); | |
| 418 } | |
| 419 | |
| 420 } | |
| 421 $summary_line.="$iQTL\t"; | |
| 422 if ($nind>=$arguments{"nind"} && $AF<=0.01) | |
| 423 { | |
| 424 $score+=$score_nIND; | |
| 425 $summary_line.="$score_nIND\t"; | |
| 426 }elsif($nind>=$arguments{"nind"}/2 && $nind<$arguments{"nind"} && $AF<=0.01){ | |
| 427 $score+=$score_nIND/2; | |
| 428 $summary_line.=$score_nIND/2 ."\t"; | |
| 429 }else{ | |
| 430 $summary_line.="0\t"; | |
| 431 } | |
| 432 chomp(); | |
| 433 chop($_); | |
| 434 $outL=""; | |
| 435 $IS_MAL=0; | |
| 436 foreach $k (sort keys %specialKeys) | |
| 437 { | |
| 438 unless($keep{$k}) | |
| 439 { | |
| 440 #warn("Malformed $_\n"); | |
| 441 warn("please check $k is missing\n"); | |
| 442 $IS_MAL=1; | |
| 443 } | |
| 444 $outL.="$keep{$k}\t"; | |
| 445 } | |
| 446 next if $IS_MAL==1; | |
| 447 $score=$score/2 if $arguments{"AD"} eq "F" && $Nhom==0; | |
| 448 $score=$score/2 if $arguments{"XL"} eq "T" && $chr ne "chrX"; | |
| 449 $summary_line.="$score\n"; | |
| 450 print O "$chr\t$start\t$gene\t$b1\t$b2\t$nind\t$outL$score\n"; #if $score>=12 || $DIS==1; | |
| 451 print OV "$chr\t$start\t$pstart\t$b1\t$b2\t$qscore\t$pb2\t$annot;VINYL_score=$score\t$gt\t$samples\n"; | |
| 452 print OS "$summary_line"; | |
| 453 } |
