Mercurial > repos > elixir-it > corgat_funann
comparison FunAnn/annotate.pl @ 0:1886293c2243 draft
Uploaded
| author | elixir-it |
|---|---|
| date | Thu, 23 Jul 2020 12:49:09 +0000 |
| parents | |
| children | 9b8afa0b6da4 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1886293c2243 |
|---|---|
| 1 $fss=13468; | |
| 2 | |
| 3 unless (-e "CorGAT") | |
| 4 | |
| 5 { | |
| 6 system("wget -i https://raw.githubusercontent.com/matteo14c/CorGAT_galaxy/dev/ann.txt"); | |
| 7 system("gzip -d GCF_009858895.2_ASM985889v3_genomic.fna.gz"); | |
| 8 } | |
| 9 | |
| 10 | |
| 11 | |
| 12 | |
| 13 $gen_code="genetic_code"; | |
| 14 die("need genetic code file in the current folder\n") unless -e "genetic_code"; | |
| 15 open(IN,$gen_code); | |
| 16 while(<IN>) | |
| 17 { | |
| 18 ($triplet,$oneL)=(split()); | |
| 19 $code{$triplet}=$oneL; | |
| 20 } | |
| 21 | |
| 22 $genome="GCF_009858895.2_ASM985889v3_genomic.fna"; | |
| 23 die("need reference genome file in the current folder\n") unless -e "GCF_009858895.2_ASM985889v3_genomic.fna"; | |
| 24 open(IN,$genome); | |
| 25 while(<IN>) | |
| 26 { | |
| 27 next if $_=~/^>/; | |
| 28 chomp; | |
| 29 $seq.=$_; | |
| 30 } | |
| 31 $table="annot_table.pl";#"simple_annot_mirror";#"annot_table.pl";#sl5 29728 29768 reg | |
| 32 die("need detailed annotation file for SARS-CoV-2 in the current folder") unless -e "annot_table.pl"; | |
| 33 open(IN,$table); | |
| 34 | |
| 35 | |
| 36 while(<IN>) | |
| 37 { | |
| 38 chomp(); | |
| 39 ($gene,$b1,$b2,$e,$annot,$notes)=(split(/\t/)); | |
| 40 $annot{$b1}{$b2}=[$gene,$e,$annot,$notes]; | |
| 41 #print "$gene $b1 $b2 $e $annot $notes\n"; | |
| 42 $len=$b2-$b1+1; | |
| 43 if ($gene ne "nsp12" && $gene ne "orf1ab") | |
| 44 { | |
| 45 $seqgene=substr($seq,$b1-1,$len+3); #un codone inizio in più e un codone fine in più | |
| 46 }else{ | |
| 47 $len1=$fss-$b1+1; | |
| 48 $part1=substr($seq,$b1-1,$len1); | |
| 49 $len2=$b2-$fss+1; | |
| 50 $part2=substr($seq,$fss-1,$len2+3); | |
| 51 $seqgene="$part1$part2"; | |
| 52 | |
| 53 } | |
| 54 $annot_seq{$gene}=$seqgene; | |
| 55 $Lgenes{$gene}=length($seqgene); | |
| 56 ($NSTOP_G,$Tseq,$pos_Stop_R)=translate($seqgene,\%code); | |
| 57 @seq_res=split('',$Tseq); | |
| 58 for ($i=0;$i<=$#seq_res;$i++) | |
| 59 { | |
| 60 $pos=$i+1; | |
| 61 $res=$seq_res[$i]; | |
| 62 } | |
| 63 } | |
| 64 %AF_data=%{read_simple_table("af_data_new.csv")}; | |
| 65 %MFE_data=%{read_simple_table("MFE_annot.csv")}; | |
| 66 %epi_data=%{read_epitopes("epitopes_annot.csv")}; | |
| 67 %hyphy_data=%{read_hyphy("hyphy_novel.csv")}; | |
| 68 | |
| 69 | |
| 70 | |
| 71 $var_File=shift;#""cl7.csv";#"phenetic_indels_sars_cov2.csv"; | |
| 72 $out_File=shift; | |
| 73 open(OUT,">$out_File"); | |
| 74 die("no input\n") unless -e $var_File; | |
| 75 open(IN,$var_File); | |
| 76 $header=<IN>; | |
| 77 @header=(split(/\s+/,$header)); | |
| 78 print OUT "POS\tREF\tALT\tannot\tAF\tEpitopes\tHyphy\tMFE\n"; | |
| 79 while(<IN>) | |
| 80 { | |
| 81 ($change,@pos)=(split()); | |
| 82 next unless $change=~/\|/; | |
| 83 ($pos,$allele)=(split(/_/,$change))[0,1]; | |
| 84 ($ref,$alt)=(split(/\|/,$allele))[0,1]; | |
| 85 $AF=$AF_data{"$pos$ref$alt"} ? $AF_data{"$pos$ref$alt"} : 0; | |
| 86 next if $alt=~/N/; | |
| 87 $annot_string=""; | |
| 88 $contained=0; | |
| 89 $epitope_string=""; | |
| 90 $hyphy_string=""; | |
| 91 $MFE_string= $MFE_data{"$pos$ref$alt"} ? $MFE_data{"$pos$ref$alt"} : "NA"; | |
| 92 #next if $ref eq"." || $alt eq "."; | |
| 93 foreach $b1 (sort{$a<=>$b} keys %annot) | |
| 94 { | |
| 95 foreach $b2 (sort{$a<=>$b} keys %{$annot{$b1}}) | |
| 96 { | |
| 97 if ($pos<=$b2 && $pos>=$b1) | |
| 98 { | |
| 99 $contained=1; | |
| 100 $type=$annot{$b1}{$b2}[1]; | |
| 101 $namegene=$annot{$b1}{$b2}[0]; | |
| 102 if ($type eq "cds") | |
| 103 { | |
| 104 #print "cds"; | |
| 105 @res=annot_CDS($pos,$ref,$alt,$namegene); | |
| 106 $annot_string.=$res[0]; | |
| 107 if ($namegene ne "orf1ab") | |
| 108 { | |
| 109 $hyphy_string.=$res[1]; | |
| 110 $epitope_string.=$res[2]; | |
| 111 } | |
| 112 }else{ | |
| 113 $rel_pos=$pos-$b1+1; | |
| 114 $annot_string.="$namegene:nc.$ref$rel_pos$alt,NA,NA;"; | |
| 115 $epitope_string="NA" if $epitope_string eq ""; | |
| 116 $hyphy_string="NA" if $hyphy_string eq ""; | |
| 117 } | |
| 118 } | |
| 119 } | |
| 120 } | |
| 121 $epitope_string=~s/\s+/;/g; | |
| 122 #$epitope_string="EpiT:$epitope_string" unless $epitope_string eq "NA"; | |
| 123 print OUT "$pos\t$ref\t$alt\t$annot_string\t$AF\t$epitope_string\t$hyphy_string\t$MFE_string\n" #if $contained==1; | |
| 124 } | |
| 125 | |
| 126 sub translate | |
| 127 { | |
| 128 @orig_seq=split('',$_[0]); | |
| 129 %gen_code=%{$_[1]}; | |
| 130 $type=""; | |
| 131 $NSTOP=0; | |
| 132 $Tseq=""; | |
| 133 $pos_Stop=0; | |
| 134 for ($i=0;$i<=$#orig_seq;$i+=3) | |
| 135 { | |
| 136 $AA=join('',@orig_seq[$i..$i+2]); | |
| 137 $res=$gen_code{$AA}; | |
| 138 $pos_Stop=$i if $pos_Stop ==0 && $res eq "*"; | |
| 139 $NSTOP++ if $res eq "*"; | |
| 140 $Tseq.=$res; | |
| 141 } | |
| 142 #print "$seq\n"; | |
| 143 return($NSTOP,$Tseq,$pos_Stop); | |
| 144 } | |
| 145 | |
| 146 sub annot_CDS | |
| 147 { | |
| 148 $pos=$_[0]; | |
| 149 $ref=$_[1]; | |
| 150 $alt=$_[2]; | |
| 151 $namegene=$_[3]; | |
| 152 my $hyphy_string="NA"; | |
| 153 my $epitopes_string="NA"; | |
| 154 #print "$pos $ref $alt $namegene\n"; | |
| 155 $pos_inG=$pos-$b1+1; | |
| 156 $pos_inG++ if $pos >$fss && ($annot{$b1}{$b2}[0] eq "nsp12" || $annot{$b1}{$b2}[0] eq "orf1ab"); | |
| 157 $mod=$pos_inG%3; | |
| 158 $rel_pos=int($pos_inG/3); | |
| 159 $rel_pos++ if $mod !=0; | |
| 160 | |
| 161 if (length($ref)==1 && $ref ne "."&& $alt ne ".") | |
| 162 { | |
| 163 | |
| 164 if ($mod ==1) | |
| 165 { | |
| 166 $triplet=substr($seq,$pos-1,3); | |
| 167 @Bs=split('',$triplet); | |
| 168 die("1\n $triplet b:$Bs[0] r:$ref") unless ($Bs[0] eq $ref); | |
| 169 $Bs[0]=$alt; | |
| 170 }elsif ($mod==2){ | |
| 171 $triplet=substr($seq,$pos-2,3); | |
| 172 @Bs=split('',$triplet); | |
| 173 die("2\n $triplet b:$Bs[1] r:$ref")unless ($Bs[1] eq $ref); | |
| 174 $Bs[1]=$alt; | |
| 175 }elsif ($mod==0){ | |
| 176 $triplet=substr($seq,$pos-3,3); | |
| 177 @Bs=split('',$triplet); | |
| 178 die("3\n $triplet b:$Bs[2] r:$ref")unless ($Bs[2] eq $ref); | |
| 179 $Bs[2]=$alt; | |
| 180 } | |
| 181 #print "$pos_inG $relpos $mod @Bs\n"; | |
| 182 $Atriplet=join("",@Bs); | |
| 183 $A1=$code{$triplet}; | |
| 184 $A2=$code{$Atriplet}; | |
| 185 $eff=$A1 eq $A2 ? "synonymous" : "missense"; | |
| 186 $eff="start_lost" if $eff eq "missense" && $rel_pos ==1; | |
| 187 $eff="stopGain" if $A2 eq "*" && $A1 ne "*"; #stopG | |
| 188 $eff="stopLoss" if $A1 eq "*" && $A2 ne "*"; #stopL | |
| 189 $eff="S" if $A2 eq "*" && $A1 eq "*"; | |
| 190 $hyphy_string=$hyphy_data{"$namegene$rel_pos"} if $hyphy_data{"$namegene$rel_pos"}; | |
| 191 $epitopes_string=join(" ",@{$epi_data{"$namegene$rel_pos$A1"}}) if $epi_data{"$namegene$rel_pos$A1"}; | |
| 192 #return("$namegene:c.$pos_inG$ref>$alt,p.$A1$rel_pos$A2,$mod,$eff;",$hyphy_string,$epitopes_string); | |
| 193 return("$namegene:c.$pos_inG$ref>$alt,p.$A1$rel_pos$A2,$eff;",$hyphy_string,$epitopes_string); | |
| 194 }else{ | |
| 195 $eff=""; | |
| 196 $gene=$annot_seq{$namegene}; | |
| 197 $lgene=length($gene); | |
| 198 $upstream=substr($gene,0,$pos_inG-1); | |
| 199 $change=substr($gene,$pos_inG,length($ref)); | |
| 200 $downstream=substr($gene,$pos_inG+length($ref)-1); | |
| 201 $len=length($alt); | |
| 202 unless ($ref=~/\./) | |
| 203 { | |
| 204 $modSeq="$upstream$alt$downstream"; | |
| 205 }else{ | |
| 206 $modSeq="$upstream$alt$change$downstream"; | |
| 207 } | |
| 208 $modSeq=~s/\.//g; | |
| 209 | |
| 210 ($NSTOP_G,$Tseq_R,$pos_Stop_R)=translate($gene,\%code); | |
| 211 ($NSTOP_R,$Tseq_alt,$pos_Stop_T)=translate($modSeq,\%code); | |
| 212 $CDS_annot_string="."; | |
| 213 #print "$namegene,aa: sr:$NSTOP_G sa:$NSTOP_R psr:$pos_Stop_R psa:$pos_Stop_T\n"; | |
| 214 if ($NSTOP_R>$NSTOP_G && ($ref=~/\./ || $alt=~/\./)) | |
| 215 { | |
| 216 | |
| 217 $eff="frameshift"; | |
| 218 $eff.="Ins" if $ref=~/\./; | |
| 219 $eff.="Del" if $alt=~/\./; | |
| 220 $truncation=$pos_Stop_T/3; | |
| 221 $ref_Seq=substr($Tseq_R,$truncation-1,1); | |
| 222 return("$namegene:c.$pos_inG$ref>$alt,p.$ref_Seq$truncation*,$eff;",$hyphy_string,$epitopes_string); | |
| 223 #return("$namegene:$pos_inG,$rel_pos,$mod,Frameshift,$eff;",$hyphy_string,$epitopes_string);#\t$CDS_annot_string"); | |
| 224 }elsif($NSTOP_R==$NSTOP_G && $pos_Stop_T<($pos_Stop_R-$len) ){ | |
| 225 # print "2\n"; | |
| 226 $truncation=$pos_Stop_T/3; | |
| 227 $ref_Seq=substr($Tseq_R,$truncation-1,1); | |
| 228 $eff="Truncating"; | |
| 229 $eff.="Ins" if $ref=~/\./; | |
| 230 $eff.="Del" if $alt=~/\./; | |
| 231 return("$namegene:c.$pos_inG$ref>$alt,p.$ref_Seq$truncation*,$eff;",$hyphy_string,$epitopes_string); | |
| 232 #return("$namegene:$pos_inG,$rel_pos,$mod,Trunc:$truncation,$eff;",$hyphy_string,$epitopes_string);#\t$CDS_annot_string="); | |
| 233 }else{ | |
| 234 %used_epitopes=(); | |
| 235 $hyphy_string=""; | |
| 236 $epitopes_string=""; | |
| 237 $pre=""; | |
| 238 $Cref=$ref; | |
| 239 $Calt=$alt; | |
| 240 if ($mod==0) | |
| 241 { | |
| 242 $pre=substr($gene,$pos_inG-3,2); | |
| 243 }elsif($mod==2){ | |
| 244 $pre=substr($gene,$pos_inG-2,1); | |
| 245 } | |
| 246 $Cref="$pre$Cref"; | |
| 247 $Calt="$pre$Calt"; | |
| 248 $post=$pos_inG+length($ref)-1; | |
| 249 while (length($Cref) %3!=0) | |
| 250 { | |
| 251 $Cref.=substr($gene,$post,1); | |
| 252 $Calt.=substr($gene,$post,1); | |
| 253 $post++; | |
| 254 if ($post>length($gene)) | |
| 255 { | |
| 256 $local_gene=substr($gene,0,length($gene)-3); | |
| 257 $eff="TruncatingDel"; | |
| 258 #print "$post\n"; | |
| 259 $copy_pos=$pos_inG; | |
| 260 #print "$pos_inG\n"; | |
| 261 #print length($gene)."\n"; | |
| 262 #die("muoro"); | |
| 263 $Loc_Ref=substr($local_gene,$copy_pos); | |
| 264 $Talt="-"; | |
| 265 while(length($Loc_Ref)%3!=0) | |
| 266 { | |
| 267 $copy_pos--; | |
| 268 $Loc_Ref=substr($local_gene,$copy_pos); | |
| 269 } | |
| 270 $rel_pos=int($copy_pos/3); | |
| 271 $rel_pos++ if $mod !=0; | |
| 272 $Tref=(translate($Loc_Ref,\%code))[1]; | |
| 273 return("$namegene:c.$pos_inG$ref>$alt,p.$Tref$rel_pos$Talt,$eff;",$hyphy_string,$epitopes_string); | |
| 274 #return("$namegene:$pos_inG,$rel_pos,$mod,$Tref->$Talt,$eff;",$hyphy_string,$epitopes_string) | |
| 275 } | |
| 276 } | |
| 277 if ($alt=~/\./ || $ref=~/\./) | |
| 278 { | |
| 279 $Tref="-"; | |
| 280 $Talt="-"; | |
| 281 $eff="inframe"; | |
| 282 if ($ref=~/\./) | |
| 283 { | |
| 284 $eff.="Ins"; | |
| 285 $Talt=(translate($Calt,\%code))[1]; | |
| 286 } | |
| 287 if ($alt=~/\./) | |
| 288 { | |
| 289 $eff.="Del"; | |
| 290 $Tref=(translate($Cref,\%code))[1]; | |
| 291 } | |
| 292 if ($eff=~/Del/) | |
| 293 { | |
| 294 @Tref=split('',$Ttref); | |
| 295 for ($i=0;$i<=$#Tref;$i++) | |
| 296 { | |
| 297 $cur_res=$Tref[$i]; | |
| 298 $cur_pos=$rel_pos+$i; | |
| 299 if ($epi_data{"$namegene$cur_res$cur_pos"}) | |
| 300 { | |
| 301 $used_epitopes{join(' ',@{$epi_data{"$namegene$cur_res$cur_pos"}})}=1; | |
| 302 } | |
| 303 if ($hyphy_data{"$namegene$curpos"}) | |
| 304 { | |
| 305 $hyphy_string.=$hyphy_data{"$namegene$curpos"} . ";"; | |
| 306 } | |
| 307 } | |
| 308 $Nkeys=keys %used_epitopes; | |
| 309 $epitopes_string.=join(";",keys %used_epitopes) if $Nkeys>=1; | |
| 310 } | |
| 311 $hyphy_string="NA" if $hyphy_string eq ""; | |
| 312 $epitopes_string="NA" if $epitopes_string eq ""; | |
| 313 #print "$Talt\n"; | |
| 314 return("$namegene:c.$pos_inG$ref>$alt,p.$Tref$rel_pos$Talt,$eff;",$hyphy_string,$epitopes_string); | |
| 315 #return("$namegene:$pos_inG,$rel_pos,$mod,$Tref->$Talt,$eff;",$hyphy_string,$epitopes_string);#\t$CDS_annot_string"); | |
| 316 }else{ | |
| 317 $Tref=(translate($Cref,\%code))[1]; | |
| 318 $Talt=(translate($Calt,\%code))[1]; | |
| 319 $eff="synonymous" if $Tref eq $Talt; | |
| 320 $eff="missense" if $Tref ne $Talt; | |
| 321 $eff="stopGain" if $Talt=~/\*/; | |
| 322 $eff="stopLoss" if $Tref=~/\*/ && !$Talt=~/\*/; | |
| 323 @Tref=split('',$Ttref); | |
| 324 for ($i=0;$i<=$#Tref;$i++) | |
| 325 { | |
| 326 $cur_res=$Tref[$i]; | |
| 327 $cur_pos=$rel_pos+$i; | |
| 328 if ($epi_data{"$namegene$cur_res$cur_pos"}) | |
| 329 { | |
| 330 $used_epitopes{join(' ',@{$epi_data{"$namegene$cur_res$cur_pos"}})}=1; | |
| 331 } | |
| 332 if ($hyphy_data{"$namegene$curpos"}) | |
| 333 { | |
| 334 $hyphy_string.=$hyphy_data{"$namegene$curpos"} . ";"; | |
| 335 } | |
| 336 } | |
| 337 $Nkeys=keys %used_epitopes; | |
| 338 $epitopes_string.=join(";",keys %used_epitopes) if $Nkeys>=1; | |
| 339 $hyphy_string="NA" if $hyphy_string eq ""; | |
| 340 $epitopes_string="NA" if $epitopes_string eq ""; | |
| 341 return("$namegene:c.$pos_inG$ref>$alt,p.$Tref$rel_pos$Talt,$eff;",$hyphy_string,$epitopes_string); | |
| 342 #return("$namegene:$pos_inG,$rel_pos,$mod,$Tref->$Talt,$eff;",$hyphy_string,$epitopes_string);#\t$CDS_annot_string"); | |
| 343 } | |
| 344 } | |
| 345 } | |
| 346 } | |
| 347 | |
| 348 | |
| 349 sub read_simple_table | |
| 350 { | |
| 351 $file=$_[0]; | |
| 352 die ("$file does not exist") unless -e $file; | |
| 353 my %data=(); | |
| 354 open(IN,$file); | |
| 355 while(<IN>) | |
| 356 { | |
| 357 ($pos,$ref,$alt,$annot)=(split()); | |
| 358 if ($annot=~/;/) | |
| 359 { | |
| 360 # $annot=(split(/\;/,$annot))[0]; | |
| 361 } | |
| 362 $data{"$pos$ref$alt"}=$annot; | |
| 363 } | |
| 364 return \%data; | |
| 365 } | |
| 366 | |
| 367 sub read_epitopes | |
| 368 { | |
| 369 $file=$_[0]; | |
| 370 die("$file does not exist") unless -e $file; | |
| 371 my %data=(); | |
| 372 open(IN,$file); | |
| 373 while(<IN>) | |
| 374 { | |
| 375 ($gene,$pos,$res,$EPIseq,$num,$HLA)=(split); | |
| 376 push(@{$data{"$gene$pos$res"}},"$EPIseq,$num,$HLA"); | |
| 377 } | |
| 378 return \%data; | |
| 379 } | |
| 380 | |
| 381 sub read_hyphy | |
| 382 { | |
| 383 $file=$_[0]; | |
| 384 die("$file does not exist") unless -e $file; | |
| 385 my %data=(); | |
| 386 open(IN,$file); | |
| 387 %kw=("fel"=>1,"meme"=>1,"kind"=>1);#"betas"=>1); | |
| 388 while(<IN>) | |
| 389 { | |
| 390 chomp(); | |
| 391 ($gene,$pos,@annot)=(split(/\,/)); | |
| 392 foreach $a (@annot) | |
| 393 { | |
| 394 ($ref,$key)=(split(/\:/,$a)); | |
| 395 $ref=~s/{//g; | |
| 396 $ref=~s/}//g; | |
| 397 $ref=~s/"//g; | |
| 398 $key=~s/"//g; | |
| 399 #print "$gene $pos $ref $key\n"; | |
| 400 if ($kw{$ref}) | |
| 401 { | |
| 402 $data{"$gene$pos"}.="$ref:$key;"; | |
| 403 } | |
| 404 } | |
| 405 | |
| 406 } | |
| 407 return \%data; | |
| 408 } |
