annotate vcf2maf.pl @ 4:e82c4e48ecd7 draft default tip

Uploaded
author elixir-it
date Thu, 14 Nov 2019 08:54:04 +0000
parents 786c9295d2be
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1 #!/usr/bin/env perl
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
2
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
3 # vcf2maf - Convert a VCF into a MAF by mapping each variant to only one of all possible gene isoforms
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
4
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
5 use strict;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
6 use warnings;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
7 use IO::File;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
8 use Getopt::Long qw( GetOptions );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
9 use Pod::Usage qw( pod2usage );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
10 use File::Copy qw( move );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
11 use File::Path qw( mkpath );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
12 use Config;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
13
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
14 # Set any default paths and constants
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
15 my ( $tumor_id, $normal_id ) = ( "TUMOR", "NORMAL" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
16 my ( $vep_path, $vep_data, $vep_forks, $buffer_size, $any_allele ) = ( "$ENV{HOME}/vep", "$ENV{HOME}/.vep", 4, 5000, 0 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
17 my ( $ref_fasta, $filter_vcf ) = ( "$ENV{HOME}/.vep/homo_sapiens/91_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz", "$ENV{HOME}/.vep/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
18 my ( $species, $ncbi_build, $cache_version, $maf_center, $retain_info, $min_hom_vaf, $max_filter_ac ) = ( "homo_sapiens", "GRCh37", "", ".", "", 0.7, 10 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
19 my $perl_bin = $Config{perlpath};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
20
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
21 # Find out if samtools and tabix are properly installed, and warn the user if it's not
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
22 my ( $samtools ) = map{chomp; $_}`which samtools`;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
23 ( $samtools and -e $samtools ) or die "ERROR: Please install samtools, and make sure it's in your PATH\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
24 my ( $tabix ) = map{chomp; $_}`which tabix`;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
25 ( $tabix and -e $tabix ) or die "ERROR: Please install tabix, and make sure it's in your PATH\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
26
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
27 # Hash to convert 3-letter amino-acid codes to their 1-letter codes
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
28 my %aa3to1 = qw( Ala A Arg R Asn N Asp D Asx B Cys C Glu E Gln Q Glx Z Gly G His H Ile I Leu L
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
29 Lys K Met M Phe F Pro P Ser S Thr T Trp W Tyr Y Val V Xxx X Ter * );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
30
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
31 # Prioritize Sequence Ontology terms in order of severity, as estimated by Ensembl:
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
32 # http://useast.ensembl.org/info/genome/variation/predicted_data.html#consequences
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
33 sub GetEffectPriority {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
34 my ( $effect ) = @_;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
35 $effect = '' unless( defined $effect );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
36 my %effectPriority = (
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
37 'transcript_ablation' => 1, # A feature ablation whereby the deleted region includes a transcript feature
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
38 'exon_loss_variant' => 1, # A sequence variant whereby an exon is lost from the transcript
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
39 'splice_donor_variant' => 2, # A splice variant that changes the 2 base region at the 5' end of an intron
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
40 'splice_acceptor_variant' => 2, # A splice variant that changes the 2 base region at the 3' end of an intron
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
41 'stop_gained' => 3, # A sequence variant whereby at least one base of a codon is changed, resulting in a premature stop codon, leading to a shortened transcript
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
42 'frameshift_variant' => 3, # A sequence variant which causes a disruption of the translational reading frame, because the number of nucleotides inserted or deleted is not a multiple of three
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
43 'stop_lost' => 3, # A sequence variant where at least one base of the terminator codon (stop) is changed, resulting in an elongated transcript
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
44 'start_lost' => 4, # A codon variant that changes at least one base of the canonical start codon
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
45 'initiator_codon_variant' => 4, # A codon variant that changes at least one base of the first codon of a transcript
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
46 'disruptive_inframe_insertion' => 5, # An inframe increase in cds length that inserts one or more codons into the coding sequence within an existing codon
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
47 'disruptive_inframe_deletion' => 5, # An inframe decrease in cds length that deletes bases from the coding sequence starting within an existing codon
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
48 'inframe_insertion' => 5, # An inframe non synonymous variant that inserts bases into the coding sequence
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
49 'inframe_deletion' => 5, # An inframe non synonymous variant that deletes bases from the coding sequence
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
50 'protein_altering_variant' => 5, # A sequence variant which is predicted to change the protein encoded in the coding sequence
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
51 'missense_variant' => 6, # A sequence variant, that changes one or more bases, resulting in a different amino acid sequence but where the length is preserved
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
52 'conservative_missense_variant' => 6, # A sequence variant whereby at least one base of a codon is changed resulting in a codon that encodes for a different but similar amino acid. These variants may or may not be deleterious
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
53 'rare_amino_acid_variant' => 6, # A sequence variant whereby at least one base of a codon encoding a rare amino acid is changed, resulting in a different encoded amino acid
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
54 'transcript_amplification' => 7, # A feature amplification of a region containing a transcript
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
55 'splice_region_variant' => 8, # A sequence variant in which a change has occurred within the region of the splice site, either within 1-3 bases of the exon or 3-8 bases of the intron
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
56 'stop_retained_variant' => 9, # A sequence variant where at least one base in the terminator codon is changed, but the terminator remains
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
57 'synonymous_variant' => 9, # A sequence variant where there is no resulting change to the encoded amino acid
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
58 'incomplete_terminal_codon_variant' => 10, # A sequence variant where at least one base of the final codon of an incompletely annotated transcript is changed
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
59 'coding_sequence_variant' => 11, # A sequence variant that changes the coding sequence
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
60 'mature_miRNA_variant' => 11, # A transcript variant located with the sequence of the mature miRNA
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
61 'exon_variant' => 11, # A sequence variant that changes exon sequence
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
62 '5_prime_UTR_variant' => 12, # A UTR variant of the 5' UTR
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
63 '5_prime_UTR_premature_start_codon_gain_variant' => 12, # snpEff-specific effect, creating a start codon in 5' UTR
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
64 '3_prime_UTR_variant' => 12, # A UTR variant of the 3' UTR
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
65 'non_coding_exon_variant' => 13, # A sequence variant that changes non-coding exon sequence
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
66 'non_coding_transcript_exon_variant' => 13, # snpEff-specific synonym for non_coding_exon_variant
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
67 'non_coding_transcript_variant' => 14, # A transcript variant of a non coding RNA gene
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
68 'nc_transcript_variant' => 14, # A transcript variant of a non coding RNA gene (older alias for non_coding_transcript_variant)
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
69 'intron_variant' => 14, # A transcript variant occurring within an intron
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
70 'intragenic_variant' => 14, # A variant that occurs within a gene but falls outside of all transcript features. This occurs when alternate transcripts of a gene do not share overlapping sequence
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
71 'INTRAGENIC' => 14, # snpEff-specific synonym of intragenic_variant
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
72 'NMD_transcript_variant' => 15, # A variant in a transcript that is the target of NMD
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
73 'upstream_gene_variant' => 16, # A sequence variant located 5' of a gene
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
74 'downstream_gene_variant' => 16, # A sequence variant located 3' of a gene
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
75 'TFBS_ablation' => 17, # A feature ablation whereby the deleted region includes a transcription factor binding site
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
76 'TFBS_amplification' => 17, # A feature amplification of a region containing a transcription factor binding site
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
77 'TF_binding_site_variant' => 17, # A sequence variant located within a transcription factor binding site
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
78 'regulatory_region_ablation' => 17, # A feature ablation whereby the deleted region includes a regulatory region
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
79 'regulatory_region_amplification' => 17, # A feature amplification of a region containing a regulatory region
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
80 'regulatory_region_variant' => 17, # A sequence variant located within a regulatory region
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
81 'regulatory_region' =>17, # snpEff-specific effect that should really be regulatory_region_variant
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
82 'feature_elongation' => 18, # A sequence variant that causes the extension of a genomic feature, with regard to the reference sequence
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
83 'feature_truncation' => 18, # A sequence variant that causes the reduction of a genomic feature, with regard to the reference sequence
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
84 'intergenic_variant' => 19, # A sequence variant located in the intergenic region, between genes
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
85 'intergenic_region' => 19, # snpEff-specific effect that should really be intergenic_variant
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
86 '' => 20
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
87 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
88 unless( defined $effectPriority{$effect} ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
89 warn "WARNING: Unrecognized effect \"$effect\". Assigning lowest priority!\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
90 return 20;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
91 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
92 return $effectPriority{$effect};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
93 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
94
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
95 # Prioritize the transcript biotypes that variants are annotated to, based on disease significance:
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
96 # All possible biotypes are defined here: http://www.gencodegenes.org/gencode_biotypes.html
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
97 sub GetBiotypePriority {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
98 my ( $biotype ) = @_;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
99 $biotype = '' unless( defined $biotype );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
100 my %biotype_priority = (
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
101 'protein_coding' => 1, # Contains an open reading frame (ORF)
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
102 'LRG_gene' => 2, # Gene in a "Locus Reference Genomic" region known to have disease-related sequence variations
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
103 'IG_C_gene' => 2, # Immunoglobulin (Ig) variable chain genes imported or annotated according to the IMGT
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
104 'IG_D_gene' => 2, # Immunoglobulin (Ig) variable chain genes imported or annotated according to the IMGT
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
105 'IG_J_gene' => 2, # Immunoglobulin (Ig) variable chain genes imported or annotated according to the IMGT
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
106 'IG_LV_gene' => 2, # Immunoglobulin (Ig) variable chain genes imported or annotated according to the IMGT
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
107 'IG_V_gene' => 2, # Immunoglobulin (Ig) variable chain genes imported or annotated according to the IMGT
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
108 'TR_C_gene' => 2, # T-cell receptor (TcR) genes imported or annotated according to the IMGT
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
109 'TR_D_gene' => 2, # T-cell receptor (TcR) genes imported or annotated according to the IMGT
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
110 'TR_J_gene' => 2, # T-cell receptor (TcR) genes imported or annotated according to the IMGT
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
111 'TR_V_gene' => 2, # T-cell receptor (TcR) genes imported or annotated according to the IMGT
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
112 'miRNA' => 3, # Non-coding RNA predicted using sequences from RFAM and miRBase
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
113 'snRNA' => 3, # Non-coding RNA predicted using sequences from RFAM and miRBase
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
114 'snoRNA' => 3, # Non-coding RNA predicted using sequences from RFAM and miRBase
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
115 'ribozyme' => 3, # Non-coding RNA predicted using sequences from RFAM and miRBase
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
116 'tRNA' => 3, #Added by Y. Boursin
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
117 'sRNA' => 3, # Non-coding RNA predicted using sequences from RFAM and miRBase
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
118 'scaRNA' => 3, # Non-coding RNA predicted using sequences from RFAM and miRBase
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
119 'rRNA' => 3, # Non-coding RNA predicted using sequences from RFAM and miRBase
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
120 'lincRNA' => 3, # Long, intervening noncoding (linc) RNAs, that can be found in evolutionarily conserved, intergenic regions
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
121 'bidirectional_promoter_lncrna' => 3, # A non-coding locus that originates from within the promoter region of a protein-coding gene, with transcription proceeding in the opposite direction on the other strand
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
122 'bidirectional_promoter_lncRNA' => 3, # A non-coding locus that originates from within the promoter region of a protein-coding gene, with transcription proceeding in the opposite direction on the other strand
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
123 'known_ncrna' => 4,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
124 'vaultRNA' => 4, # Short non coding RNA genes that form part of the vault ribonucleoprotein complex
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
125 'macro_lncRNA' => 4, # unspliced lncRNAs that are several kb in size
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
126 'Mt_tRNA' => 4, # Non-coding RNA predicted using sequences from RFAM and miRBase
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
127 'Mt_rRNA' => 4, # Non-coding RNA predicted using sequences from RFAM and miRBase
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
128 'antisense' => 5, # Has transcripts that overlap the genomic span (i.e. exon or introns) of a protein-coding locus on the opposite strand
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
129 'antisense_RNA' => 5, # Alias for antisense (Y. Boursin)
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
130 'sense_intronic' => 5, # Long non-coding transcript in introns of a coding gene that does not overlap any exons
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
131 'sense_overlapping' => 5, # Long non-coding transcript that contains a coding gene in its intron on the same strand
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
132 '3prime_overlapping_ncrna' => 5, # Transcripts where ditag and/or published experimental data strongly supports the existence of short non-coding transcripts transcribed from the 3'UTR
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
133 '3prime_overlapping_ncRNA' => 5, # Transcripts where ditag and/or published experimental data strongly supports the existence of short non-coding transcripts transcribed from the 3'UTR
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
134 'misc_RNA' => 5, # Non-coding RNA predicted using sequences from RFAM and miRBase
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
135 'non_coding' => 5, # Transcript which is known from the literature to not be protein coding
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
136 'regulatory_region' => 6, # A region of sequence that is involved in the control of a biological process
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
137 'disrupted_domain' => 6, # Otherwise viable coding region omitted from this alternatively spliced transcript because the splice variation affects a region coding for a protein domain
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
138 'processed_transcript' => 6, # Doesn't contain an ORF
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
139 'TEC' => 6, # To be Experimentally Confirmed. This is used for non-spliced EST clusters that have polyA features. This category has been specifically created for the ENCODE project to highlight regions that could indicate the presence of protein coding genes that require experimental validation, either by 5' RACE or RT-PCR to extend the transcripts, or by confirming expression of the putatively-encoded peptide with specific antibodies
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
140 'TF_binding_site' => 7, # A region of a nucleotide molecule that binds a Transcription Factor or Transcription Factor complex
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
141 'CTCF_binding_site' =>7, # A transcription factor binding site with consensus sequence CCGCGNGGNGGCAG, bound by CCCTF-binding factor
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
142 'promoter_flanking_region' => 7, # A region immediately adjacent to a promoter which may or may not contain transcription factor binding sites
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
143 'enhancer' => 7, # A cis-acting sequence that increases the utilization of (some) eukaryotic promoters, and can function in either orientation and in any location (upstream or downstream) relative to the promoter
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
144 'promoter' => 7, # A regulatory_region composed of the TSS(s) and binding sites for TF_complexes of the basal transcription machinery
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
145 'open_chromatin_region' => 7, # A DNA sequence that in the normal state of the chromosome corresponds to an unfolded, un-complexed stretch of double-stranded DNA
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
146 'retained_intron' => 7, # Alternatively spliced transcript believed to contain intronic sequence relative to other, coding, variants
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
147 'nonsense_mediated_decay' => 7, # If the coding sequence (following the appropriate reference) of a transcript finishes >50bp from a downstream splice site then it is tagged as NMD. If the variant does not cover the full reference coding sequence then it is annotated as NMD if NMD is unavoidable i.e. no matter what the exon structure of the missing portion is the transcript will be subject to NMD
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
148 'non_stop_decay' => 7, # Transcripts that have polyA features (including signal) without a prior stop codon in the CDS, i.e. a non-genomic polyA tail attached directly to the CDS without 3' UTR. These transcripts are subject to degradation
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
149 'ambiguous_orf' => 7, # Transcript believed to be protein coding, but with more than one possible open reading frame
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
150 'pseudogene' => 8, # Have homology to proteins but generally suffer from a disrupted coding sequence and an active homologous gene can be found at another locus. Sometimes these entries have an intact coding sequence or an open but truncated ORF, in which case there is other evidence used (for example genomic polyA stretches at the 3' end) to classify them as a pseudogene. Can be further classified as one of the following
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
151 'processed_pseudogene' => 8, # Pseudogene that lack introns and is thought to arise from reverse transcription of mRNA followed by reinsertion of DNA into the genome
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
152 'polymorphic_pseudogene' => 8, # Pseudogene owing to a SNP/DIP but in other individuals/haplotypes/strains the gene is translated
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
153 'retrotransposed' => 8, # Pseudogene owing to a reverse transcribed and re-inserted sequence
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
154 'translated_processed_pseudogene' => 8, # Pseudogenes that have mass spec data suggesting that they are also translated
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
155 'translated_unprocessed_pseudogene' => 8, # Pseudogenes that have mass spec data suggesting that they are also translated
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
156 'transcribed_processed_pseudogene' => 8, # Pseudogene where protein homology or genomic structure indicates a pseudogene, but the presence of locus-specific transcripts indicates expression
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
157 'transcribed_unprocessed_pseudogene' => 8, # Pseudogene where protein homology or genomic structure indicates a pseudogene, but the presence of locus-specific transcripts indicates expression
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
158 'transcribed_unitary_pseudogene' => 8, #Pseudogene where protein homology or genomic structure indicates a pseudogene, but the presence of locus-specific transcripts indicates expression
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
159 'unitary_pseudogene' => 8, # A species specific unprocessed pseudogene without a parent gene, as it has an active orthologue in another species
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
160 'unprocessed_pseudogene' => 8, # Pseudogene that can contain introns since produced by gene duplication
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
161 'Mt_tRNA_pseudogene' => 8, # Non-coding RNAs predicted to be pseudogenes by the Ensembl pipeline
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
162 'tRNA_pseudogene' => 8, # Non-coding RNAs predicted to be pseudogenes by the Ensembl pipeline
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
163 'snoRNA_pseudogene' => 8, # Non-coding RNAs predicted to be pseudogenes by the Ensembl pipeline
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
164 'snRNA_pseudogene' => 8, # Non-coding RNAs predicted to be pseudogenes by the Ensembl pipeline
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
165 'scRNA_pseudogene' => 8, # Non-coding RNAs predicted to be pseudogenes by the Ensembl pipeline
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
166 'rRNA_pseudogene' => 8, # Non-coding RNAs predicted to be pseudogenes by the Ensembl pipeline
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
167 'misc_RNA_pseudogene' => 8, # Non-coding RNAs predicted to be pseudogenes by the Ensembl pipeline
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
168 'miRNA_pseudogene' => 8, # Non-coding RNAs predicted to be pseudogenes by the Ensembl pipeline
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
169 'IG_C_pseudogene' => 8, # Inactivated immunoglobulin gene
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
170 'IG_D_pseudogene' => 8, # Inactivated immunoglobulin gene
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
171 'IG_J_pseudogene' => 8, # Inactivated immunoglobulin gene
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
172 'IG_V_pseudogene' => 8, # Inactivated immunoglobulin gene
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
173 'TR_J_pseudogene' => 8, # Inactivated immunoglobulin gene
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
174 'TR_V_pseudogene' => 8, # Inactivated immunoglobulin gene
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
175 'artifact' => 9, # Used to tag mistakes in the public databases (Ensembl/SwissProt/Trembl)
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
176 '' => 10
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
177 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
178 unless( defined $biotype_priority{$biotype} ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
179 warn "WARNING: Unrecognized biotype \"$biotype\". Assigning lowest priority!\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
180 return 10;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
181 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
182 return $biotype_priority{$biotype};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
183 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
184
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
185 # Check for missing or crappy arguments
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
186 unless( @ARGV and $ARGV[0] =~ m/^-/ ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
187 pod2usage( -verbose => 0, -message => "$0: Missing or invalid arguments!\n", -exitval => 2 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
188 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
189
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
190 # Parse options and print usage if there is a syntax error, or if usage was explicitly requested
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
191 my ( $man, $help ) = ( 0, 0 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
192 my ( $input_vcf, $output_maf, $tmp_dir, $custom_enst_file );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
193 my ( $vcf_tumor_id, $vcf_normal_id, $remap_chain );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
194 GetOptions(
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
195 'help!' => \$help,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
196 'man!' => \$man,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
197 'input-vcf=s' => \$input_vcf,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
198 'output-maf=s' => \$output_maf,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
199 'tmp-dir=s' => \$tmp_dir,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
200 'tumor-id=s' => \$tumor_id,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
201 'normal-id=s' => \$normal_id,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
202 'vcf-tumor-id=s' => \$vcf_tumor_id,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
203 'vcf-normal-id=s' => \$vcf_normal_id,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
204 'custom-enst=s' => \$custom_enst_file,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
205 'vep-path=s' => \$vep_path,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
206 'vep-data=s' => \$vep_data,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
207 'vep-forks=s' => \$vep_forks,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
208 'buffer-size=i' => \$buffer_size,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
209 'any-allele!' => \$any_allele,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
210 'ref-fasta=s' => \$ref_fasta,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
211 'species=s' => \$species,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
212 'ncbi-build=s' => \$ncbi_build,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
213 'cache-version=s' => \$cache_version,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
214 'maf-center=s' => \$maf_center,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
215 'retain-info=s' => \$retain_info,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
216 'min-hom-vaf=s' => \$min_hom_vaf,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
217 'remap-chain=s' => \$remap_chain,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
218 'filter-vcf=s' => \$filter_vcf,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
219 'max-filter-ac=i' => \$max_filter_ac
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
220 ) or pod2usage( -verbose => 1, -input => \*DATA, -exitval => 2 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
221 pod2usage( -verbose => 1, -input => \*DATA, -exitval => 0 ) if( $help );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
222 pod2usage( -verbose => 2, -input => \*DATA, -exitval => 0 ) if( $man );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
223
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
224 # Check if required arguments are missing or problematic
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
225 ( defined $input_vcf and defined $output_maf ) or die "ERROR: Both input-vcf and output-maf must be defined!\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
226 ( -s $input_vcf ) or die "ERROR: Provided --input-vcf is missing or empty: $input_vcf\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
227 ( -s $ref_fasta ) or die "ERROR: Provided --ref-fasta is missing or empty: $ref_fasta\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
228 ( $input_vcf !~ m/\.(gz|bz2|bcf)$/ ) or die "ERROR: Unfortunately, --input-vcf cannot be in a compressed format\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
229
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
230 # Unless specified, assume that the VCF uses the same sample IDs that the MAF will contain
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
231 $vcf_tumor_id = $tumor_id unless( $vcf_tumor_id );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
232 $vcf_normal_id = $normal_id unless( $vcf_normal_id );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
233
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
234 # Load up the custom isoform overrides if provided:
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
235 my %custom_enst;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
236 if( $custom_enst_file ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
237 ( -s $custom_enst_file ) or die "ERROR: Provided --custom-enst file is missing or empty: $custom_enst_file\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
238 %custom_enst = map{chomp; ( $_, 1 )}`grep -v ^# $custom_enst_file | cut -f1`;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
239 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
240
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
241 # Create a folder for the intermediate VCFs if user-defined, or default to the input VCF's folder
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
242 if( defined $tmp_dir ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
243 mkpath( $tmp_dir ) unless( -d $tmp_dir );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
244 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
245 else {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
246 $tmp_dir = substr( $input_vcf, 0, rindex( $input_vcf, "/" )) if( $input_vcf =~ m/\// );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
247 $tmp_dir = "." unless( $tmp_dir ); # In case the input VCF is in the current working directory
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
248 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
249
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
250 # Also figure out the base name of the input VCF, cuz we'll be naming a lot of files based on that
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
251 my $input_name = substr( $input_vcf, rindex( $input_vcf, "/" ) + 1 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
252 $input_name =~ s/(\.vcf)*$//;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
253
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
254 # If the VCF contains SVs, split the breakpoints into separate lines before passing to VEP
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
255 my $split_svs = 0;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
256 my $orig_vcf_fh = IO::File->new( $input_vcf ) or die "ERROR: Couldn't open --input-vcf: $input_vcf!\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
257 my $split_vcf_fh = IO::File->new( "$tmp_dir/$input_name.split.vcf", "w" ) or die "ERROR: Couldn't open VCF: $tmp_dir/$input_name.split.vcf!\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
258 while( my $line = $orig_vcf_fh->getline ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
259 # If the file uses Mac OS 9 newlines, quit with an error
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
260 ( $line !~ m/\r$/ ) or die "ERROR: Your VCF uses CR line breaks, which we can't support. Please use LF or CRLF.\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
261
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
262 if( $line =~ m/^#/ ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
263 $split_vcf_fh->print( $line ); # Write header lines unchanged
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
264 next;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
265 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
266
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
267 chomp( $line );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
268 my @cols = split( "\t", $line );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
269 my %info = map {( m/=/ ? ( split( /=/, $_, 2 )) : ( $_, "1" ))} split( /\;/, $cols[7] );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
270 if( $info{SVTYPE} ){
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
271 # Remove SVTYPE tag if REF/ALT alleles are defined, or VEP won't report transcript effects
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
272 if( $cols[3]=~m/^[ACGTN]+$/i and $cols[4]=~m/^[ACGTN,]+$/i ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
273 $cols[7]=~s/(SVTYPE=\w+;|;SVTYPE=\w+|SVTYPE=\w+)//;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
274 $split_vcf_fh->print( join( "\t", @cols ), "\n" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
275 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
276 # For legit SVs except insertions, split them into two separate breakpoint events
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
277 elsif( $info{SVTYPE}=~m/^(BND|TRA|DEL|DUP|INV)$/ ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
278 $split_svs = 1;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
279 # Don't tell VEP it's an SV, by removing the SVTYPE tag
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
280 $cols[7]=~s/(SVTYPE=\w+;|;SVTYPE=\w+|SVTYPE=\w+)//;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
281 # Rename two SV specific INFO keys to something friendlier
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
282 $cols[7]=~s/CT=([35]to[35])/Frame=$1/;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
283 $cols[7]=~s/SVMETHOD=([\w.]+)/Method=$1/;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
284 $cols[4] = "<" . $info{SVTYPE} . ">";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
285 # Fetch the REF allele at the second breakpoint using samtools faidx
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
286 my $ref2 = `$samtools faidx $ref_fasta $info{CHR2}:$info{END}-$info{END} | grep -v ^\\>`;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
287 chomp( $ref2 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
288 $split_vcf_fh->print( join( "\t", $info{CHR2}, $info{END}, $cols[2], ( $ref2 ? $ref2 : $cols[3] ), @cols[4..$#cols] ), "\n" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
289 $split_vcf_fh->print( join( "\t", @cols ), "\n" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
290 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
291 $input_vcf = "$tmp_dir/$input_name.split.vcf";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
292 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
293 else {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
294 $split_vcf_fh->print( join( "\t", @cols ), "\n" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
295 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
296 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
297 $split_vcf_fh->close;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
298 $orig_vcf_fh->close;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
299
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
300 # Delete the split.vcf created above if we didn't find any variants with the SVTYPE tag
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
301 unlink( "$tmp_dir/$input_name.split.vcf" ) if( $input_vcf ne "$tmp_dir/$input_name.split.vcf" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
302
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
303 # If a liftOver chain was provided, remap and switch the input VCF before annotation
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
304 my ( %remap );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
305 if( $remap_chain ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
306 # Find out if liftOver is properly installed, and warn the user if it's not
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
307 my $liftover = `which liftOver`;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
308 chomp( $liftover );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
309 ( $liftover and -e $liftover ) or die "ERROR: Please install liftOver, and make sure it's in your PATH\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
310
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
311 # Make a BED file from the VCF, run liftOver on it, and create a hash mapping old to new loci
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
312 `grep -v ^# $input_vcf | cut -f1,2 | awk '{OFS="\\t"; print \$1,\$2-1,\$2,\$1":"\$2}' > $tmp_dir/$input_name.bed`;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
313 %remap = map{chomp; my @c=split("\t"); ($c[3], "$c[0]:$c[2]")}`$liftover $tmp_dir/$input_name.bed $remap_chain /dev/stdout /dev/null 2> /dev/null`;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
314 unlink( "$tmp_dir/$input_name.bed" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
315
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
316 # Create a new VCF in the temp folder, with remapped loci on which we'll run annotation
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
317 my $orig_vcf_fh = IO::File->new( $input_vcf ) or die "ERROR: Couldn't open --input-vcf: $input_vcf!\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
318 my $remap_vcf_fh = IO::File->new( "$tmp_dir/$input_name.remap.vcf", "w" ) or die "ERROR: Couldn't open VCF: $tmp_dir/$input_name.remap.vcf!\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
319 while( my $line = $orig_vcf_fh->getline ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
320 if( $line =~ m/^#/ ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
321 $remap_vcf_fh->print( $line ); # Write header lines unchanged
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
322 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
323 else {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
324 chomp( $line );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
325 my @cols = split( "\t", $line );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
326 my $locus = $cols[0] . ":" . $cols[1];
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
327 if( defined $remap{$locus} ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
328 # Retain original variant under INFO, so we can append it later to the output MAF
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
329 $cols[7] = ( !$cols[7] or $cols[7] eq "." ? "" : "$cols[7];" ) . "REMAPPED_POS=" . join( ":", @cols[0,1,3,4] );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
330 @cols[0,1] = split( ":", $remap{$locus} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
331 $remap_vcf_fh->print( join( "\t", @cols ), "\n" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
332 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
333 else {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
334 warn "WARNING: Skipping variant at $locus; Unable to liftOver using $remap_chain\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
335 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
336 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
337 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
338 $remap_vcf_fh->close;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
339 $orig_vcf_fh->close;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
340 $input_vcf = "$tmp_dir/$input_name.remap.vcf";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
341 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
342
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
343 # Before running annotation, let's pull flanking reference bps for each variant to do some checks,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
344 # and we'll also pull out overlapping calls from the filter VCF
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
345 my $vcf_fh = IO::File->new( $input_vcf ) or die "ERROR: Couldn't open --input-vcf: $input_vcf!\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
346 my ( %ref_bps, @ref_regions, %uniq_loci, %uniq_regions, %flanking_bps, %filter_data );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
347 while( my $line = $vcf_fh->getline ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
348 # Skip header lines, and pull variant loci to pass to samtools later
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
349 next if( $line =~ m/^#/ );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
350 chomp( $line );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
351 my ( $chr, $pos, undef, $ref ) = split( "\t", $line );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
352 # Create a region that spans the length of the reference allele and 1bp flanks around it
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
353 my $region = "$chr:" . ( $pos - 1 ) . "-" . ( $pos + length( $ref ));
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
354 $ref_bps{$region} = $ref;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
355 push( @ref_regions, $region );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
356 $uniq_regions{$region} = 1;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
357 $uniq_loci{"$chr:$pos-$pos"} = 1;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
358 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
359 $vcf_fh->close;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
360
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
361 # samtools runs faster when passed many loci at a time, but limited to around 125k args, at least
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
362 # on CentOS 6. If there are too many loci, split them into 50k chunks and run separately
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
363 my ( $lines, @regions_split ) = ( "", ());
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
364 my @regions = keys %uniq_regions;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
365 my $chr_prefix_in_use = ( @regions and $regions[0] =~ m/^chr/ ? 1 : 0 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
366 push( @regions_split, [ splice( @regions, 0, 50000 ) ] ) while @regions;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
367 map{ my $region = join( " ", @{$_} ); $lines .= `$samtools faidx $ref_fasta $region` } @regions_split;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
368 foreach my $line ( grep( length, split( ">", $lines ))) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
369 # Carefully split this FASTA entry, properly chomping newlines for long indels
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
370 my ( $region, $bps ) = split( "\n", $line, 2 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
371 $bps =~ s/\r|\n//g;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
372 if( $bps ){
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
373 $bps = uc( $bps );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
374 $flanking_bps{$region} = $bps;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
375 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
376 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
377
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
378 # If flanking_bps is entirely empty, then it's most likely that the user chose the wrong ref-fasta
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
379 # Or it's also possible that an outdated samtools was unable to parse the gzipped FASTA files
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
380 # ::NOTE:: If input had no variants, don't break here, so we can continue to create an empty MAF
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
381 ( !@regions or %flanking_bps ) or die "ERROR: You're either using an outdated samtools, or --ref-fasta is not the same genome build as your --input-vcf.";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
382
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
383 # Skip filtering if not handling GRCh37, and filter-vcf is pointing to the default GRCh37 ExAC VCF
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
384 if(( $species eq "homo_sapiens" and $ncbi_build eq "GRCh37" and $filter_vcf ) or ( $filter_vcf and $filter_vcf ne "$ENV{HOME}/.vep/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz" )) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
385 ( -s $filter_vcf ) or die "ERROR: Provided --filter-vcf is missing or empty: $filter_vcf\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
386 # Query each variant locus on the filter VCF, using tabix, just like we used samtools earlier
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
387 ( $lines, @regions_split ) = ( "", ());
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
388 my @regions = keys %uniq_loci;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
389 push( @regions_split, [ splice( @regions, 0, 50000 ) ] ) while @regions;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
390 # ::NOTE:: chr-prefix removal works safely here because ExAC is limited to 1..22, X, Y
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
391 map{ my $loci = join( " ", map{s/^chr//; $_} @{$_} ); $lines .= `$tabix $filter_vcf $loci` } @regions_split;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
392 foreach my $line ( split( "\n", $lines )) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
393 my ( $chr, $pos, undef, $ref, $alt, undef, $filter, $info_line ) = split( "\t", $line );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
394 # Parse out data from info column, and store it for later, along with REF, ALT, and FILTER
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
395 my $locus = ( $chr_prefix_in_use ? "chr$chr:$pos" : "$chr:$pos" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
396 %{$filter_data{$locus}} = map {( m/=/ ? ( split( /=/, $_, 2 )) : ( $_, "1" ))} split( /\;/, $info_line );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
397 $filter_data{$locus}{REF} = $ref;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
398 $filter_data{$locus}{ALT} = $alt;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
399 $filter_data{$locus}{FILTER} = $filter;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
400 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
401 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
402
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
403 # For each variant locus and reference allele in the input VCF, report any problems
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
404 foreach my $region ( @ref_regions ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
405 my $ref = $ref_bps{$region};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
406 my ( $locus ) = map{ my ( $chr, $pos ) = split( ":" ); ++$pos; "$chr:$pos" } split( "-", $region );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
407 if( !defined $flanking_bps{$region} ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
408 warn "WARNING: Couldn't retrieve bps around $locus from reference FASTA: $ref_fasta\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
409 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
410 elsif( $flanking_bps{$region} !~ m/^[ACGTN]+$/ ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
411 warn "WARNING: Retrieved invalid bps " . $flanking_bps{$region} . " around $locus from reference FASTA: $ref_fasta\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
412 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
413 elsif( $ref ne substr( $flanking_bps{$region}, 1, length( $ref ))) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
414 warn "WARNING: Reference allele $ref at $locus doesn't match " .
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
415 substr( $flanking_bps{$region}, 1, length( $ref )) . " (flanking bps: " .
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
416 $flanking_bps{$region} . ") from reference FASTA: $ref_fasta\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
417 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
418 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
419
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
420 # Annotate variants in given VCF to all possible transcripts
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
421 my $output_vcf = ( $remap_chain ? "$tmp_dir/$input_name.remap.vep.vcf" : "$tmp_dir/$input_name.vep.vcf" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
422 # Skip running VEP if an annotated VCF already exists
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
423 unless( -s $output_vcf ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
424 warn "STATUS: Running VEP and writing to: $output_vcf\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
425 # Make sure we can find the VEP script
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
426 my $vep_script = ( -s "$vep_path/vep" ? "$vep_path/vep" : "$vep_path/variant_effect_predictor.pl" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
427 ( -s $vep_script ) or die "ERROR: Cannot find VEP script in path: $vep_path\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
428
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
429 # Contruct VEP command using some default options and run it
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
430 my $vep_cmd = "$perl_bin $vep_script --species $species --assembly $ncbi_build --offline --no_progress --no_stats --buffer_size $buffer_size --sift b --ccds --uniprot --hgvs --symbol --numbers --domains --gene_phenotype --canonical --protein --biotype --uniprot --tsl --pubmed --variant_class --shift_hgvs 1 --check_existing --total_length --allele_number --no_escape --xref_refseq --failed 1 --vcf --flag_pick_allele --pick_order canonical,tsl,biotype,rank,ccds,length --dir $vep_data --fasta $ref_fasta --format vcf --input_file $input_vcf --output_file $output_vcf";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
431 # VEP barks if --fork is set to 1. So don't use this argument unless it's >1
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
432 $vep_cmd .= " --fork $vep_forks" if( $vep_forks > 1 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
433 # Require allele match for co-located variants unless user-rejected or we're using a newer VEP
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
434 $vep_cmd .= " --check_allele" unless( $any_allele or $vep_script =~ m/vep$/ );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
435 # Add --cache-version only if the user specifically asked for a version
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
436 $vep_cmd .= " --cache_version $cache_version" if( $cache_version );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
437 # Add options that only work on human variants
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
438 if( $species eq "homo_sapiens" ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
439 # Slight change in these arguments if using the newer VEP
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
440 $vep_cmd .= " --polyphen b " . ( $vep_script =~ m/vep$/ ? "--af --af_1kg --af_esp --af_gnomad" : "--gmaf --maf_1kg --maf_esp" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
441 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
442 # Add options that work for most species, except a few we know about
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
443 $vep_cmd .= " --regulatory" unless( $species eq "canis_familiaris" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
444
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
445 # Make sure it ran without error codes
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
446 system( $vep_cmd ) == 0 or die "\nERROR: Failed to run the VEP annotator! Command: $vep_cmd\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
447 ( -s $output_vcf ) or warn "WARNING: VEP-annotated VCF file is missing or empty: $output_vcf\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
448 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
449
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
450 # Define default MAF Header (https://wiki.nci.nih.gov/x/eJaPAQ) with our vcf2maf additions
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
451 my @maf_header = qw(
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
452 Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
453 Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
454 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
455 Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
456 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
457 Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
458 BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID HGVSc HGVSp HGVSp_Short Transcript_ID
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
459 Exon_Number t_depth t_ref_count t_alt_count n_depth n_ref_count n_alt_count all_effects
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
460 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
461
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
462 # Add extra annotation columns to the MAF in a consistent order
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
463 my @ann_cols = qw( Allele Gene Feature Feature_type Consequence cDNA_position CDS_position
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
464 Protein_position Amino_acids Codons Existing_variation ALLELE_NUM DISTANCE STRAND_VEP SYMBOL
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
465 SYMBOL_SOURCE HGNC_ID BIOTYPE CANONICAL CCDS ENSP SWISSPROT TREMBL UNIPARC RefSeq SIFT PolyPhen
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
466 EXON INTRON DOMAINS AF AFR_AF AMR_AF ASN_AF EAS_AF EUR_AF SAS_AF AA_AF EA_AF CLIN_SIG SOMATIC
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
467 PUBMED MOTIF_NAME MOTIF_POS HIGH_INF_POS MOTIF_SCORE_CHANGE IMPACT PICK VARIANT_CLASS TSL
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
468 HGVS_OFFSET PHENO MINIMISED ExAC_AF ExAC_AF_AFR ExAC_AF_AMR ExAC_AF_EAS ExAC_AF_FIN ExAC_AF_NFE
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
469 ExAC_AF_OTH ExAC_AF_SAS GENE_PHENO FILTER flanking_bps variant_id variant_qual ExAC_AF_Adj
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
470 ExAC_AC_AN_Adj ExAC_AC_AN ExAC_AC_AN_AFR ExAC_AC_AN_AMR ExAC_AC_AN_EAS ExAC_AC_AN_FIN
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
471 ExAC_AC_AN_NFE ExAC_AC_AN_OTH ExAC_AC_AN_SAS ExAC_FILTER gnomAD_AF gnomAD_AFR_AF gnomAD_AMR_AF
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
472 gnomAD_ASJ_AF gnomAD_EAS_AF gnomAD_FIN_AF gnomAD_NFE_AF gnomAD_OTH_AF gnomAD_SAS_AF );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
473
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
474 my @ann_cols_format; # To store the actual order of VEP data, that may differ between runs
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
475 push( @maf_header, @ann_cols );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
476
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
477 # If the user has INFO fields they want to retain, create additional columns for those
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
478 my @addl_info_cols = ();
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
479 if( $retain_info or $remap_chain or $split_svs ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
480 # But let's not overwrite existing columns with the same name
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
481 my %maf_cols = map{ my $c = lc; ( $c, 1 )} @maf_header;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
482 @addl_info_cols = grep{ my $c = lc; !$maf_cols{$c}} split( ",", $retain_info );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
483 # If a remap-chain was used, add a column to retain the original chr:pos:ref:alt
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
484 push( @addl_info_cols, "REMAPPED_POS" ) if( $remap_chain );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
485 # If we had to split some SVs earlier, add some columns with some useful info about SVs
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
486 push( @addl_info_cols, qw( Fusion Method Frame CONSENSUS )) if( $split_svs );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
487 push( @maf_header, @addl_info_cols );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
488 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
489
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
490 # Locate and load the file mapping ENSG IDs to Entrez IDs
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
491 my ( $script_dir ) = $0 =~ m/^(.*)\/vcf2maf/;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
492 $script_dir = "." unless( $script_dir );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
493
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
494 my $entrez_id_file = "$script_dir/data/ensg_to_entrez_id_map_ensembl_feb2014.tsv";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
495 my %entrez_id_map = ();
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
496 if( -s $entrez_id_file ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
497 %entrez_id_map = map{chomp; split("\t")} `grep -hv ^# $entrez_id_file`;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
498 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
499
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
500 # Parse through each variant in the annotated VCF, pull out CSQ/ANN from the INFO column, and choose
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
501 # one transcript per variant whose annotation will be used in the MAF
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
502 my $maf_fh = IO::File->new( $output_maf, ">" ) or die "ERROR: Couldn't open --output-maf: $output_maf!\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
503 $maf_fh->print( "#version 2.4\n" . join( "\t", @maf_header ), "\n" ); # Print MAF header
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
504 ( -s $output_vcf ) or exit; # Warnings on this were printed earlier, but quit here, only after a blank MAF is created
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
505 my $annotated_vcf_fh = IO::File->new( $output_vcf ) or die "ERROR: Couldn't open annotated VCF: $output_vcf!\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
506 my ( $vcf_tumor_idx, $vcf_normal_idx, %sv_pair );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
507 while( my $line = $annotated_vcf_fh->getline ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
508
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
509 # Parse out the VEP CSQ/ANN format, which seems to differ between runs
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
510 if( $line =~ m/^##INFO=<ID=(CSQ|ANN).*Format: (\S+)">$/ ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
511 # Use this as the expected column order of VEP annotation, unless we already got it from CSQ
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
512 @ann_cols_format = split( /\|/, $2 ) unless( @ann_cols_format and $1 eq "ANN" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
513 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
514 # Skip all other header lines
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
515 next if( $line =~ m/^##/ );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
516
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
517 chomp( $line );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
518 my ( $chrom, $pos, $var_id, $ref, $alt, $var_qual, $filter, $info_line, $format_line, @rest ) = split( "\t", $line );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
519
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
520 # Set ID, QUAL, and FILTER to "." unless defined and non-empty
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
521 $var_id = "." unless( defined $var_id and $var_id ne "" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
522 $var_qual = "." unless( defined $var_qual and $var_qual ne "" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
523 $filter = "." unless( defined $filter and $filter ne "" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
524
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
525 # If FORMATted genotype fields are available, find the sample with the variant, and matched normal
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
526 if( $line =~ m/^#CHROM/ ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
527 if( $format_line and scalar( @rest ) > 0 ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
528 for( my $i = 0; $i <= $#rest; ++$i ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
529 $vcf_tumor_idx = $i if( $rest[$i] eq $vcf_tumor_id );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
530 $vcf_normal_idx = $i if( $rest[$i] eq $vcf_normal_id );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
531 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
532 ( defined $vcf_tumor_idx ) or warn "WARNING: No genotype column for $vcf_tumor_id in VCF!\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
533 ( defined $vcf_normal_idx ) or warn "WARNING: No genotype column for $vcf_normal_id in VCF!\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
534 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
535 next;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
536 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
537
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
538 # Parse out the data in the info column, and store into a hash
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
539 my %info = map {( m/=/ ? ( split( /=/, $_, 2 )) : ( $_, "1" ))} split( /\;/, $info_line );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
540
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
541 # By default, the variant allele is the first (usually the only) allele listed under ALT. If
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
542 # there are >1 alleles in ALT, choose the first non-REF allele listed under tumor GT, that is
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
543 # also not seen under normal GT. If tumor GT is undefined or ambiguous, choose the tumor allele
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
544 # with the most supporting read depth, if available.
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
545 my @alleles = ( $ref, split( /,/, $alt ));
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
546 my $var_allele_idx = 1;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
547
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
548 # Parse out info from the normal genotype field
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
549 my ( %nrm_info, @nrm_depths );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
550 if( defined $vcf_normal_idx ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
551 my @format_keys = split( /\:/, $format_line );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
552 my $idx = 0;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
553 %nrm_info = map {( $format_keys[$idx++], $_ )} split( /\:/, $rest[$vcf_normal_idx] );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
554 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
555
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
556 # Parse out info from the tumor genotype field
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
557 my ( %tum_info, @tum_depths );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
558 if( defined $vcf_tumor_idx ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
559 my @format_keys = split( /\:/, $format_line );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
560 my $idx = 0;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
561 %tum_info = map {( $format_keys[$idx++], $_ )} split( /\:/, $rest[$vcf_tumor_idx] );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
562
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
563 # If possible, parse the tumor genotype to identify the variant allele
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
564 if( defined $tum_info{GT} and $tum_info{GT} ne "." and $tum_info{GT} ne "./." ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
565 my @tum_gt = split( /[\/|]/, $tum_info{GT} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
566 # Default to the first non-REF allele seen in tumor GT
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
567 ( $var_allele_idx ) = grep {$_ ne "0"} @tum_gt;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
568 # If possible, choose the first non-REF tumor allele that is also not in normal GT
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
569 if( defined $nrm_info{GT} and $nrm_info{GT} ne "." and $nrm_info{GT} ne "./." ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
570 my %nrm_gt = map {( $_, 1 )} split( /[\/|]/, $nrm_info{GT} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
571 ( $var_allele_idx ) = grep {$_ ne "0" and !$nrm_gt{$_}} @tum_gt;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
572 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
573 # If GT was unhelpful, default to the first ALT allele and set GT to undefined
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
574 if( !defined $var_allele_idx or $var_allele_idx !~ m/^\d+$/ or $var_allele_idx >= scalar( @alleles )) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
575 $var_allele_idx = 1;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
576 $tum_info{GT} = "./.";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
577 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
578 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
579
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
580 # Standardize tumor AD and DP based on data in the genotype fields
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
581 FixAlleleDepths( \@alleles, $var_allele_idx, \%tum_info );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
582 @tum_depths = split( ",", $tum_info{AD} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
583
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
584 # If genotype is undefined, use the allele depths collected to choose the major variant allele
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
585 unless( defined $tum_info{GT} and $tum_info{GT} ne '.' and $tum_info{GT} ne "./." ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
586 # The first depth listed belongs to the reference allele. Of the rest, find the largest
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
587 for( my $i = 1; $i <= $#tum_depths; ++$i ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
588 $var_allele_idx = $i if( $tum_depths[$i] and $tum_depths[$i] > $tum_depths[$var_allele_idx] );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
589 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
590 $tum_info{GT} = "./.";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
591 if( defined $tum_info{DP} and $tum_info{DP} ne '.' and $tum_info{DP} != 0 and defined $tum_depths[$var_allele_idx] ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
592 my $vaf = $tum_depths[$var_allele_idx] / $tum_info{DP};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
593 $tum_info{GT} = ( $vaf < $min_hom_vaf ? "0/1" : "1/1" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
594 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
595 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
596 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
597
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
598 # Set the variant allele to whatever we selected above
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
599 my $var = $alleles[$var_allele_idx];
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
600
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
601 # Standardize normal AD and DP based on data in the genotype fields
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
602 if( defined $vcf_normal_idx ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
603 FixAlleleDepths( \@alleles, $var_allele_idx, \%nrm_info );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
604 @nrm_depths = split( ",", $nrm_info{AD} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
605 $nrm_info{GT} = "./." unless( defined $nrm_info{GT} and $nrm_info{GT} ne '.' );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
606 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
607
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
608 # Figure out the appropriate start/stop loci and variant type/allele to report in the MAF
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
609 my $start = my $stop = my $var_type = my $inframe = "";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
610 my ( $ref_length, $var_length ) = ( length( $ref ), length( $var ));
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
611 # Backup the VCF-style position and REF/ALT alleles, so we can use it later
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
612 my ( $vcf_pos, $vcf_ref, $vcf_var ) = ( $pos, $ref, $var );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
613 # Remove any prefixed reference bps from all alleles, using "-" for simple indels
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
614 while( $ref and $var and substr( $ref, 0, 1 ) eq substr( $var, 0, 1 ) and $ref ne $var ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
615 ( $ref, $var, @alleles ) = map{$_ = substr( $_, 1 ); ( $_ ? $_ : "-" )} ( $ref, $var, @alleles );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
616 --$ref_length; --$var_length; ++$pos;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
617 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
618 # Handle SNPs, DNPs, TNPs, or anything larger (ONP)
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
619 if( $ref_length == $var_length ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
620 ( $start, $stop ) = ( $pos, $pos + $var_length - 1 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
621 my %np_type = qw( 1 SNP 2 DNP 3 TNP );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
622 $var_type = ( $var_length > 3 ? "ONP" : $np_type{$var_length} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
623 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
624 # Handle all indels, including those complex ones which contain substitutions
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
625 elsif( $ref_length != $var_length ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
626 if( $ref_length < $var_length ) { # Handle insertions, and the special case for complex ones
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
627 ( $start, $stop ) = (( $ref eq "-" ? $pos - 1 : $pos ), ( $ref eq "-" ? $pos : $pos + $ref_length - 1 ));
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
628 $var_type = "INS";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
629 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
630 else { # Handle deletions
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
631 ( $start, $stop ) = ( $pos, $pos + $ref_length - 1 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
632 $var_type = "DEL";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
633 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
634 $inframe = ( abs( $ref_length - $var_length ) % 3 == 0 ? 1 : 0 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
635 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
636
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
637 my @all_effects; # A list of effects of this variant on all possible transcripts
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
638 my $maf_effect; # A single effect per variant to report in the standard MAF columns
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
639 my %maf_line = map{( $_, '' )} @maf_header; # Initialize MAF fields with blank strings
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
640
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
641 # VEP provides a comma-delimited list of consequences, with pipe-delim details per consequence
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
642 # It replaces ',' in details with '&'. We'll assume that all '&'s we see, were formerly commas
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
643 # "Consequence" might list multiple effects on the same transcript e.g. missense,splice_region
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
644 if( $info{CSQ} or $info{ANN} ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
645
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
646 my $ann_lines = ( $info{CSQ} ? $info{CSQ} : $info{ANN} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
647 foreach my $ann_line ( split( /,/, $ann_lines )) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
648 my $idx = 0;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
649 my %effect = map{s/\&/,/g; ( $ann_cols_format[$idx++], ( defined $_ ? $_ : '' ))} split( /\|/, $ann_line );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
650
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
651 # Remove transcript ID from HGVS codon/protein changes, to make it easier on the eye
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
652 $effect{HGVSc} =~ s/^.*:// if( $effect{HGVSc} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
653 $effect{HGVSp} =~ s/^.*:// if( $effect{HGVSp} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
654
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
655 # Remove the prefixed HGVSc code in HGVSp, if found
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
656 $effect{HGVSp} =~ s/^.*\((p\.\S+)\)/$1/ if( $effect{HGVSp} and $effect{HGVSp} =~ m/^c\./ );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
657
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
658 # Sort consequences by decreasing order of severity, and pick the most severe one
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
659 $effect{Consequence} = join( ",", sort { GetEffectPriority($a) <=> GetEffectPriority($b) } split( ",", $effect{Consequence} ));
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
660 ( $effect{One_Consequence} ) = split( ",", $effect{Consequence} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
661
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
662 # When VEP fails to provide any value in Consequence, tag it as an intergenic variant
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
663 $effect{One_Consequence} = "intergenic_variant" unless( $effect{Consequence} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
664
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
665 # Create a shorter HGVS protein format using 1-letter codes
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
666 if( $effect{HGVSp} ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
667 my $hgvs_p_short = $effect{HGVSp};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
668 while( $hgvs_p_short and my ( $find, $replace ) = each %aa3to1 ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
669 eval "\$hgvs_p_short =~ s{$find}{$replace}g";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
670 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
671 $effect{HGVSp_Short} = $hgvs_p_short;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
672 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
673
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
674 # Fix HGVSp_Short, CDS_position, and Protein_position for splice acceptor/donor variants
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
675 if( $effect{One_Consequence} =~ m/^(splice_acceptor_variant|splice_donor_variant)$/ ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
676 my ( $c_pos ) = $effect{HGVSc} =~ m/^c.(\d+)/;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
677 if( defined $c_pos ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
678 $c_pos = 1 if( $c_pos < 1 ); # Handle negative cDNA positions used in 5' UTRs
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
679 my $p_pos = sprintf( "%.0f", ( $c_pos + $c_pos % 3 ) / 3 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
680 $effect{HGVSp_Short} = "p.X" . $p_pos . "_splice";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
681 $effect{CDS_position} =~ s/^-(\/\d+)$/$c_pos$1/;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
682 $effect{Protein_position} =~ s/^-(\/\d+)$/$p_pos$1/;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
683 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
684 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
685
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
686 # Fix HGVSp_Short for Silent mutations, so it mentions the amino-acid and position
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
687 if( defined $effect{HGVSp_Short} and $effect{HGVSp_Short} eq "p.=" ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
688 my ( $p_pos ) = $effect{Protein_position} =~ m/^(\d+)(-\d+)?\/\d+$/;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
689 my $aa = $effect{Amino_acids};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
690 $effect{HGVSp_Short} = "p.$aa" . $p_pos . "=";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
691 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
692
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
693 # Copy VEP data into MAF fields that don't share the same identifier
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
694 $effect{Transcript_ID} = $effect{Feature};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
695 $effect{Exon_Number} = $effect{EXON};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
696 $effect{Hugo_Symbol} = ( $effect{SYMBOL} ? $effect{SYMBOL} : '' );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
697
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
698 # If AF columns from the older VEP are found, rename to the newer ones for consistency
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
699 my %af_col = qw( GMAF AF AFR_MAF AFR_AF AMR_MAF AMR_AF ASN_MAF ASN_AF EAS_MAF EAS_AF
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
700 EUR_MAF EUR_AF SAS_MAF SAS_AF AA_MAF AA_AF EA_MAF EA_AF );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
701 map { $effect{$af_col{$_}} = $effect{$_} if( defined $effect{$_} )} keys %af_col;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
702
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
703 # If VEP couldn't find this variant in dbSNP/COSMIC/etc., we'll say it's "novel"
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
704 if( $effect{Existing_variation} ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
705 # ::NOTE:: If seen in a DB other than dbSNP, this field will remain blank
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
706 $effect{dbSNP_RS} = join( ",", grep{m/^rs\d+$/} split( /,/, $effect{Existing_variation} ));
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
707 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
708 else {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
709 $effect{dbSNP_RS} = "novel";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
710 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
711
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
712 # Transcript_Length isn't separately reported, but can be parsed out from cDNA_position
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
713 ( $effect{Transcript_Length} ) = $effect{cDNA_position} =~ m/\/(\d+)$/;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
714 $effect{Transcript_Length} = 0 unless( defined $effect{Transcript_Length} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
715
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
716 # Skip effects on other ALT alleles. If ALLELE_NUM is undefined (e.g. for INFO:SVTYPE), don't skip any
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
717 push( @all_effects, \%effect ) unless( $effect{ALLELE_NUM} and $effect{ALLELE_NUM} != $var_allele_idx );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
718 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
719
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
720 # Sort effects first by transcript biotype, then by severity, and then by longest transcript
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
721 @all_effects = sort {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
722 GetBiotypePriority( $a->{BIOTYPE} ) <=> GetBiotypePriority( $b->{BIOTYPE} ) ||
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
723 GetEffectPriority( $a->{One_Consequence} ) <=> GetEffectPriority( $b->{One_Consequence} ) ||
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
724 $b->{Transcript_Length} <=> $a->{Transcript_Length}
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
725 } @all_effects;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
726
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
727 # Find the highest priority effect with a gene symbol (usually the first one)
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
728 my ( $effect_with_gene_name ) = grep { $_->{SYMBOL} } @all_effects;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
729 my $maf_gene = $effect_with_gene_name->{SYMBOL} if( $effect_with_gene_name );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
730
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
731 # If the gene has user-defined custom isoform overrides, choose that instead
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
732 ( $maf_effect ) = grep { $_->{SYMBOL} and $_->{SYMBOL} eq $maf_gene and $_->{Transcript_ID} and $custom_enst{$_->{Transcript_ID}} } @all_effects;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
733
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
734 # Find the effect on the canonical transcript of that highest priority gene
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
735 ( $maf_effect ) = grep { $_->{SYMBOL} and $_->{SYMBOL} eq $maf_gene and $_->{CANONICAL} and $_->{CANONICAL} eq "YES" } @all_effects unless( $maf_effect );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
736
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
737 # If that gene has no canonical transcript tagged, choose the highest priority canonical effect on any gene
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
738 ( $maf_effect ) = grep { $_->{CANONICAL} and $_->{CANONICAL} eq "YES" } @all_effects unless( $maf_effect );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
739
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
740 # If none of the effects are tagged as canonical, then just report the top priority effect
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
741 $maf_effect = $all_effects[0] unless( $maf_effect );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
742 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
743
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
744 # Construct the MAF columns from the $maf_effect hash
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
745 %maf_line = map{( $_, ( $maf_effect->{$_} ? $maf_effect->{$_} : '' ))} @maf_header;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
746 $maf_line{Hugo_Symbol} = $maf_effect->{Transcript_ID} unless( $maf_effect->{Hugo_Symbol} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
747 $maf_line{Hugo_Symbol} = 'Unknown' unless( $maf_effect->{Transcript_ID} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
748 $maf_line{Entrez_Gene_Id} = ( defined $entrez_id_map{$maf_effect->{Gene}} ? $entrez_id_map{$maf_effect->{Gene}} : "0" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
749 $maf_line{Center} = $maf_center;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
750 $maf_line{NCBI_Build} = $ncbi_build;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
751 $maf_line{Chromosome} = $chrom;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
752 $maf_line{Start_Position} = $start;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
753 $maf_line{End_Position} = $stop;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
754 $maf_line{Strand} = '+'; # Per MAF definition, only the positive strand is an accepted value
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
755 $maf_line{STRAND_VEP} = $maf_effect->{STRAND}; # Renamed to avoid mixup with "Strand" above
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
756 $maf_line{Variant_Classification} = GetVariantClassification( $maf_effect->{One_Consequence}, $var_type, $inframe );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
757 $maf_line{Variant_Type} = $var_type;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
758 $maf_line{Reference_Allele} = $ref;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
759 # ::NOTE:: If tumor genotype is unavailable, then we'll assume it's ref/var heterozygous
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
760 $maf_line{Tumor_Seq_Allele1} = $ref;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
761 $maf_line{Tumor_Seq_Allele2} = $var;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
762 if( defined $tum_info{GT} and $tum_info{GT} ne "." and $tum_info{GT} ne "./." ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
763 # ::NOTE:: MAF only supports biallelic sites. Tumor_Seq_Allele2 must always be the $var
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
764 # picked earlier. For Tumor_Seq_Allele1, pick the first non-var allele in GT (usually $ref)
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
765 my ( $idx1, $idx2 ) = split( /[\/|]/, $tum_info{GT} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
766 # If GT was monoploid, then $idx2 will be undefined, and we should set it equal to $idx1
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
767 $idx2 = $idx1 unless( defined $idx2 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
768 $maf_line{Tumor_Seq_Allele1} = ( $alleles[$idx1] ne $var ? $alleles[$idx1] : $alleles[$idx2] );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
769 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
770 # ::NOTE:: If normal genotype is unavailable, then we'll assume it's ref/ref homozygous
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
771 $maf_line{Match_Norm_Seq_Allele1} = $ref;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
772 $maf_line{Match_Norm_Seq_Allele2} = $ref;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
773 if( defined $nrm_info{GT} and $nrm_info{GT} ne "." and $nrm_info{GT} ne "./." ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
774 # ::NOTE:: MAF only supports biallelic sites. So choose the first two alleles listed in GT
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
775 my ( $idx1, $idx2 ) = split( /[\/|]/, $nrm_info{GT} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
776 # If GT was monoploid, then $idx2 will be undefined, and we should set it equal to $idx1
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
777 $idx2 = $idx1 unless( defined $idx2 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
778 $maf_line{Match_Norm_Seq_Allele1} = $alleles[$idx1];
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
779 $maf_line{Match_Norm_Seq_Allele2} = $alleles[$idx2];
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
780 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
781 $maf_line{Tumor_Sample_Barcode} = $tumor_id;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
782 $maf_line{Matched_Norm_Sample_Barcode} = $normal_id;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
783 $maf_line{t_depth} = $tum_info{DP} if( defined $tum_info{DP} and $tum_info{DP} ne "." );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
784 ( $maf_line{t_ref_count}, $maf_line{t_alt_count} ) = @tum_depths[0,$var_allele_idx] if( @tum_depths );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
785 $maf_line{n_depth} = $nrm_info{DP} if( defined $nrm_info{DP} and $nrm_info{DP} ne "." );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
786 ( $maf_line{n_ref_count}, $maf_line{n_alt_count} ) = @nrm_depths[0,$var_allele_idx] if( @nrm_depths );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
787
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
788 # Create a semicolon delimited list summarizing the prioritized effects in @all_effects
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
789 $maf_line{all_effects} = "";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
790 foreach my $effect ( @all_effects ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
791 my $gene_name = $effect->{Hugo_Symbol};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
792 my $effect_type = $effect->{One_Consequence};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
793 my $protein_change = ( $effect->{HGVSp} ? $effect->{HGVSp} : '' );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
794 my $transcript_id = ( $effect->{Transcript_ID} ? $effect->{Transcript_ID} : '' );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
795 my $refseq_ids = ( $effect->{RefSeq} ? $effect->{RefSeq} : '' );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
796 $maf_line{all_effects} .= "$gene_name,$effect_type,$protein_change,$transcript_id,$refseq_ids;" if( $effect_type and $transcript_id );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
797 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
798
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
799 # If this variant was seen in the ExAC VCF, let's report allele counts and frequencies
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
800 # ExAC merges and pads multiallelic sites, so we need to normalize each variant, (remove common
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
801 # suffixed bps) before we compare it to our variant allele
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
802 my $locus = "$chrom:$vcf_pos";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
803 if( defined $filter_data{$locus} ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
804 my $idx = 0;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
805 $maf_line{ExAC_FILTER} = $filter_data{$locus}{FILTER};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
806 foreach my $f_var ( split( ",", $filter_data{$locus}{ALT} )) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
807 my $f_ref = $filter_data{$locus}{REF};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
808 # De-pad suffixed bps that are identical between ref/var alleles
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
809 while( $f_ref and $f_var and substr( $f_ref, -1, 1 ) eq substr( $f_var, -1, 1 ) and $f_ref ne $f_var ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
810 ( $f_ref, $f_var ) = map{substr( $_, 0, -1 )} ( $f_ref, $f_var );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
811 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
812 # If this normalized variant matches our input variant, report its allele counts
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
813 # ExAC reports MNPs as separate SNPs. So we'll need to report the ACs of the first SNP
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
814 if(( $vcf_ref eq $f_ref and $vcf_var eq $f_var ) or
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
815 (( $var_type eq "DNP" or $var_type eq "TNP" or $var_type eq "ONP") and
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
816 ( $vcf_ref =~ m/^$f_ref/ and $vcf_var =~ m/^$f_var/ ))) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
817 my @var_acs = split( ",", $filter_data{$locus}{AC} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
818 my $var_ac = $var_acs[$idx];
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
819 my $pop_an = $filter_data{$locus}{AN};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
820 $maf_line{ExAC_AF} = sprintf( "%.4g", ( $pop_an ? ( $var_ac / $pop_an ) : 0 ));
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
821 $maf_line{ExAC_AC_AN} = join( "/", $var_ac, $pop_an );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
822 # Do the same for AC/AN in each subpopulation, and the adjusted total AC/AN (Adj)
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
823 foreach my $subpop ( qw( AFR AMR EAS FIN NFE OTH SAS Adj )) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
824 @var_acs = split( ",", $filter_data{$locus}{"AC_$subpop"} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
825 $var_ac = $var_acs[$idx];
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
826 $pop_an = $filter_data{$locus}{"AN_$subpop"};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
827 $maf_line{"ExAC_AF_$subpop"} = sprintf( "%.4g", ( $pop_an ? ( $var_ac / $pop_an ) : 0 ));
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
828 $maf_line{"ExAC_AC_AN_$subpop"} = join( "/", $var_ac, $pop_an );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
829 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
830 last;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
831 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
832 ++$idx;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
833 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
834 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
835
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
836 # Copy FILTER from input VCF, and tag calls with high allele counts in any ExAC subpopulation
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
837 my $subpop_count = 0;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
838 # Remove existing common_variant tags from input, so it's redefined by our criteria here
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
839 $filter = join( ";", grep{ $_ ne "common_variant" } split( /,|;/, $filter ));
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
840 foreach my $subpop ( qw( AFR AMR EAS FIN NFE OTH SAS )) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
841 if( $maf_line{"ExAC_AC_AN_$subpop"} ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
842 my ( $subpop_ac ) = split( "/", $maf_line{"ExAC_AC_AN_$subpop"} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
843 $subpop_count++ if( $subpop_ac > $max_filter_ac );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
844 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
845 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
846 if( $subpop_count > 0 ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
847 $filter = (( $filter eq "PASS" or $filter eq "." or !$filter ) ? "common_variant" : "$filter;common_variant" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
848 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
849 $maf_line{FILTER} = $filter;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
850
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
851 # Also add the reference allele flanking bps that we generated earlier with samtools
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
852 my $region = "$chrom:" . ( $vcf_pos - 1 ) . "-" . ( $vcf_pos + length( $vcf_ref ));
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
853 $maf_line{flanking_bps} = $flanking_bps{$region};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
854
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
855 # Add ID and QUAL from the input VCF into respective MAF columns
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
856 $maf_line{variant_id} = $var_id;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
857 $maf_line{variant_qual} = $var_qual;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
858
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
859 # If there are additional INFO data to add, then add those
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
860 foreach my $info_col ( @addl_info_cols ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
861 $maf_line{$info_col} = ( defined $info{$info_col} ? $info{$info_col} : "" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
862 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
863
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
864 # If this is an SV, pair up gene names from separate lines to backfill the Fusion column later
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
865 if( $split_svs and $var=~m/^<BND|DEL|DUP|INV>$/ ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
866 my $sv_key = "$var_id-$tumor_id";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
867 if( $sv_pair{$sv_key} ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
868 $sv_pair{$sv_key} = $sv_pair{$sv_key} . "-" . $maf_line{Hugo_Symbol} . " fusion";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
869 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
870 else {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
871 $sv_pair{$sv_key} = $maf_line{Hugo_Symbol};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
872 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
873 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
874
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
875 # At this point, we've generated all we can about this variant, so write it to the MAF
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
876 $maf_fh->print( join( "\t", map{( defined $maf_line{$_} ? $maf_line{$_} : "" )} @maf_header ) . "\n" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
877 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
878 $maf_fh->close;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
879 $annotated_vcf_fh->close;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
880
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
881 # If the MAF lists SVs, backfill the Fusion column with gene-pair names
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
882 if( $split_svs ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
883 my $output_name = substr( $output_maf, rindex( $output_maf, "/" ) + 1 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
884 $output_name =~ s/(\.maf)*$//;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
885 my $tmp_output_maf = "$tmp_dir/$output_name.tmp.maf";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
886
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
887 my $in_maf_fh = IO::File->new( $output_maf ) or die "ERROR: Couldn't open: $output_maf!\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
888 my $out_maf_fh = IO::File->new( $tmp_output_maf, ">" ) or die "ERROR: Couldn't open: $tmp_output_maf!\n";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
889 my ( $tid_idx, $fusion_idx, $var_id_idx ) = ( 0, 0, 0 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
890 while( my $line = $in_maf_fh->getline ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
891 chomp( $line );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
892 if( $line =~ m/^#/ ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
893 $out_maf_fh->print( "$line\n" ); # Copy comments unchanged
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
894 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
895 elsif( $line =~ m/^Hugo_Symbol/ ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
896 # Copy the header unchanged, after figuring out necessary column indexes
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
897 foreach( split( /\t/, $line )) { last if( $_ eq "Tumor_Sample_Barcode" ); ++$tid_idx; }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
898 foreach( split( /\t/, $line )) { last if( $_ eq "Fusion" ); ++$fusion_idx; }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
899 foreach( split( /\t/, $line )) { last if( $_ eq "variant_id" ); ++$var_id_idx; }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
900 $out_maf_fh->print( "$line\n" ); # Copy header unchanged
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
901 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
902 else {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
903 # Write the gene-pair name into the Fusion column if it was backfilled earlier
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
904 my @cols = split( /\t/, $line, -1 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
905 my $sv_key = $cols[$var_id_idx] . "-" . $cols[$tid_idx];
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
906 $cols[$fusion_idx] = $sv_pair{$sv_key} if( $sv_pair{$sv_key} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
907 $out_maf_fh->print( join( "\t", @cols ) . "\n" );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
908 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
909 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
910 $out_maf_fh->close;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
911 $in_maf_fh->close;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
912
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
913 move( $tmp_output_maf, $output_maf );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
914 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
915
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
916 # Converts Sequence Ontology variant types to MAF variant classifications
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
917 sub GetVariantClassification {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
918 my ( $effect, $var_type, $inframe ) = @_;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
919 return "Splice_Site" if( $effect =~ /^(splice_acceptor_variant|splice_donor_variant|transcript_ablation|exon_loss_variant)$/ );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
920 return "Nonsense_Mutation" if( $effect eq 'stop_gained' );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
921 return "Frame_Shift_Del" if(( $effect eq 'frameshift_variant' or ( $effect eq 'protein_altering_variant' and !$inframe )) and $var_type eq 'DEL' );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
922 return "Frame_Shift_Ins" if(( $effect eq 'frameshift_variant' or ( $effect eq 'protein_altering_variant' and !$inframe )) and $var_type eq 'INS' );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
923 return "Nonstop_Mutation" if( $effect eq 'stop_lost' );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
924 return "Translation_Start_Site" if( $effect =~ /^(initiator_codon_variant|start_lost)$/ );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
925 return "In_Frame_Ins" if( $effect =~ /^(inframe_insertion|disruptive_inframe_insertion)$/ or ( $effect eq 'protein_altering_variant' and $inframe and $var_type eq 'INS' ));
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
926 return "In_Frame_Del" if( $effect =~ /^(inframe_deletion|disruptive_inframe_deletion)$/ or ( $effect eq 'protein_altering_variant' and $inframe and $var_type eq 'DEL' ));
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
927 return "Missense_Mutation" if( $effect =~ /^(missense_variant|coding_sequence_variant|conservative_missense_variant|rare_amino_acid_variant)$/ );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
928 return "Intron" if ( $effect =~ /^(transcript_amplification|intron_variant|INTRAGENIC|intragenic_variant)$/ );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
929 return "Splice_Region" if( $effect eq 'splice_region_variant' );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
930 return "Silent" if( $effect =~ /^(incomplete_terminal_codon_variant|synonymous_variant|stop_retained_variant|NMD_transcript_variant)$/ );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
931 return "RNA" if( $effect =~ /^(mature_miRNA_variant|exon_variant|non_coding_exon_variant|non_coding_transcript_exon_variant|non_coding_transcript_variant|nc_transcript_variant)$/ );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
932 return "5'UTR" if( $effect =~ /^(5_prime_UTR_variant|5_prime_UTR_premature_start_codon_gain_variant)$/ );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
933 return "3'UTR" if( $effect eq '3_prime_UTR_variant' );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
934 return "IGR" if( $effect =~ /^(TF_binding_site_variant|regulatory_region_variant|regulatory_region|intergenic_variant|intergenic_region)$/ );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
935 return "5'Flank" if( $effect eq 'upstream_gene_variant' );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
936 return "3'Flank" if ( $effect eq 'downstream_gene_variant' );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
937
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
938 # Annotate everything else simply as a targeted region
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
939 # TFBS_ablation, TFBS_amplification,regulatory_region_ablation, regulatory_region_amplification,
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
940 # feature_elongation, feature_truncation
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
941 return "Targeted_Region";
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
942 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
943
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
944 # Fix the AD and DP fields, given data from a FORMATted genotype string
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
945 sub FixAlleleDepths {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
946 my ( $alleles_ref, $var_allele_idx, $fmt_info_ref ) = @_;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
947 my %fmt_info = %{$fmt_info_ref};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
948 my @alleles = @{$alleles_ref};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
949 my @depths = ();
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
950
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
951 # If AD is defined, then parse out all REF/ALT allele depths, or whatever is in it
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
952 if( defined $fmt_info{AD} and $fmt_info{AD} ne "." ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
953 @depths = map{( m/^\d+$/ ? $_ : "" )}split( /,/, $fmt_info{AD} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
954 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
955
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
956 # Handle VarScan VCF lines where AD contains only 1 depth, and REF allele depth is in RD
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
957 if( scalar( @depths ) == 1 and defined $fmt_info{RD} ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
958 @depths = map{""} @alleles;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
959 $depths[0] = $fmt_info{RD};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
960 $depths[$var_allele_idx] = $fmt_info{AD};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
961 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
962 # Handle SomaticSniper VCF lines, where allele depths must be extracted from BCOUNT
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
963 elsif( !defined $fmt_info{AD} and defined $fmt_info{BCOUNT} ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
964 my %b_idx = ( A=>0, C=>1, G=>2, T=>3 );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
965 my @bcount = split( /,/, $fmt_info{BCOUNT} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
966 @depths = map{(( defined $b_idx{$_} and defined $bcount[$b_idx{$_}] ) ? $bcount[$b_idx{$_}] : "" )} @alleles;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
967 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
968 # Handle VCF SNV lines by Strelka, where allele depths are in AU:CU:GU:TU
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
969 elsif( !defined $fmt_info{AD} and scalar( grep{defined $fmt_info{$_}} qw/AU CU GU TU/ ) == 4 ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
970 # Strelka allele depths come in tiers 1,2. We'll use tier1 cuz it's stricter, and DP already is
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
971 map{( $fmt_info{$_.'U'} ) = split( ",", $fmt_info{$_.'U'} )} qw( A C G T );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
972
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
973 # If the only ALT allele is N, then set it to the allele with the highest non-ref readcount
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
974 if( scalar( @alleles ) == 2 and $alleles[1] eq "N" ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
975 my %acgt_depths = map{( defined $fmt_info{$_.'U'} ? ( $_, $fmt_info{$_.'U'} ) : ( $_, "" ))} qw( A C G T );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
976 my @deepest = sort {$acgt_depths{$b} <=> $acgt_depths{$a}} keys %acgt_depths;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
977 ( $alleles[1] ) = ( $deepest[0] ne $alleles[0] ? $deepest[0] : $deepest[1] );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
978 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
979 @depths = map{( defined $fmt_info{$_.'U'} ? $fmt_info{$_.'U'} : "" )} @alleles;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
980 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
981 # Handle VCF Indel lines by Strelka, where variant allele depth is in TIR
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
982 elsif( !defined $fmt_info{AD} and $fmt_info{TIR} ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
983 # Reference allele depth is not provided by Strelka for indels, so we have to skip it
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
984 @depths = ( "", ( split /,/, $fmt_info{TIR} )[0] );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
985 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
986 # Handle VCF lines by CaVEMan, where allele depths are in FAZ:FCZ:FGZ:FTZ:RAZ:RCZ:RGZ:RTZ
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
987 elsif( !defined $fmt_info{AD} and scalar( grep{defined $fmt_info{$_}} qw/FAZ FCZ FGZ FTZ RAZ RCZ RGZ RTZ/ ) == 8 ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
988 # Create tags for forward+reverse strand reads, and use those to determine REF/ALT depths
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
989 map{ $fmt_info{$_} = $fmt_info{'F'.$_} + $fmt_info{'R'.$_} } qw( AZ CZ GZ TZ );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
990 @depths = map{( defined $fmt_info{$_.'Z'} ? $fmt_info{$_.'Z'} : "" )} @alleles;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
991 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
992 # Handle VCF lines from the Ion Torrent Suite where ALT depths are in AO and REF depths are in RO
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
993 elsif( !defined $fmt_info{AD} and defined $fmt_info{AO} and defined $fmt_info{RO} ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
994 @depths = ( $fmt_info{RO}, map{( m/^\d+$/ ? $_ : "" )}split( /,/, $fmt_info{AO} ));
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
995 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
996 # Handle VCF lines from Delly where REF/ALT SV junction read counts are in RR/RV respectively
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
997 elsif( !defined $fmt_info{AD} and defined $fmt_info{RR} and defined $fmt_info{RV} ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
998 # Reference allele depth and depths for any other ALT alleles must be left undefined
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
999 @depths = map{""} @alleles;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1000 $depths[0] = $fmt_info{RR};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1001 $depths[$var_allele_idx] = $fmt_info{RV};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1002 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1003 # Handle VCF lines from cgpPindel, where ALT depth and total depth are in PP:NP:PR:NR
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1004 elsif( !defined $fmt_info{AD} and scalar( grep{defined $fmt_info{$_}} qw/PP NP PR NR/ ) == 4 ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1005 # Reference allele depth and depths for any other ALT alleles must be left undefined
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1006 @depths = map{""} @alleles;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1007 $depths[$var_allele_idx] = $fmt_info{PP} + $fmt_info{NP};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1008 $fmt_info{DP} = $fmt_info{PR} + $fmt_info{NR};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1009 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1010 # Handle VCF lines with ALT allele fraction in FA, which needs to be multiplied by DP to get AD
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1011 elsif( !defined $fmt_info{AD} and defined $fmt_info{FA} and defined $fmt_info{DP} and $fmt_info{DP} ne '.' ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1012 # Reference allele depth and depths for any other ALT alleles must be left undefined
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1013 @depths = map{""} @alleles;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1014 $depths[$var_allele_idx] = sprintf( "%.0f", $fmt_info{FA} * $fmt_info{DP} );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1015 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1016 # Handle VCF lines from mpileup/bcftools where DV contains the ALT allele depth
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1017 elsif( !defined $fmt_info{AD} and defined $fmt_info{DV} and defined $fmt_info{DP} ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1018 # Reference allele depth and depths for any other ALT alleles must be left undefined
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1019 @depths = map{""} @alleles;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1020 $depths[$var_allele_idx] = $fmt_info{DV};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1021 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1022 # Handle VCF lines where AD contains only 1 value, that we can assume is the variant allele
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1023 elsif( defined $fmt_info{AD} and @depths and scalar( @depths ) == 1 ) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1024 # Reference allele depth and depths for any other ALT alleles must be left undefined
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1025 @depths = map{""} @alleles;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1026 $depths[$var_allele_idx] = $fmt_info{AD};
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1027 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1028 # For all other lines where #depths is not equal to #alleles, blank out the depths
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1029 elsif( @depths and scalar( @depths ) ne scalar( @alleles )) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1030 @depths = map{""} @alleles;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1031 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1032
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1033 # Sanity check that REF/ALT allele depths are lower than the total depth
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1034 if( defined $fmt_info{DP} and $fmt_info{DP} ne '.' and (( $depths[0] and $depths[0] > $fmt_info{DP} ) or
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1035 ( $depths[$var_allele_idx] and $depths[$var_allele_idx] > $fmt_info{DP} ) or
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1036 ( $depths[0] and $depths[$var_allele_idx] and $depths[0] + $depths[$var_allele_idx] > $fmt_info{DP} ))) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1037 $fmt_info{DP} = 0;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1038 map{$fmt_info{DP} += $_ if($_ and $_ ne '.')} @depths;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1039 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1040
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1041 # If we have REF/ALT allele depths, but no DP, then set DP equal to the sum of all ADs
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1042 if(( defined $depths[0] and defined $depths[$var_allele_idx] ) and ( !defined $fmt_info{DP} or $fmt_info{DP} eq '.' )) {
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1043 $fmt_info{DP} = 0;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1044 map{$fmt_info{DP} += $_ if($_ and $_ ne '.')} @depths;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1045 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1046
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1047 # Put all our changes back into the hash/array references that were passed over
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1048 $fmt_info{AD} = join( ",", map{( $_ ne "" ? $_ : "." )} @depths );
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1049 %{$fmt_info_ref} = %fmt_info;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1050 @{$alleles_ref} = @alleles;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1051
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1052 return 1;
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1053 }
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1054
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1055 __DATA__
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1056
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1057 =head1 NAME
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1058
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1059 vcf2maf.pl - Convert a VCF into a MAF by mapping each variant to only one of all possible gene isoforms
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1060
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1061 =head1 SYNOPSIS
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1062
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1063 perl vcf2maf.pl --help
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1064 perl vcf2maf.pl --input-vcf WD4086.vcf --output-maf WD4086.maf --tumor-id WD4086 --normal-id NB4086
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1065
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1066 =head1 OPTIONS
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1067
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1068 --input-vcf Path to input file in VCF format
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1069 --output-maf Path to output MAF file
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1070 --tmp-dir Folder to retain intermediate VCFs after runtime [Default: Folder containing input VCF]
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1071 --tumor-id Tumor_Sample_Barcode to report in the MAF [TUMOR]
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1072 --normal-id Matched_Norm_Sample_Barcode to report in the MAF [NORMAL]
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1073 --vcf-tumor-id Tumor sample ID used in VCF's genotype columns [--tumor-id]
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1074 --vcf-normal-id Matched normal ID used in VCF's genotype columns [--normal-id]
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1075 --custom-enst List of custom ENST IDs that override canonical selection
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1076 --vep-path Folder containing the vep script [~/vep]
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1077 --vep-data VEP's base cache/plugin directory [~/.vep]
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1078 --vep-forks Number of forked processes to use when running VEP [4]
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1079 --buffer-size Number of variants VEP loads at a time; Reduce this for low memory systems [5000]
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1080 --any-allele When reporting co-located variants, allow mismatched variant alleles too
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1081 --ref-fasta Reference FASTA file [~/.vep/homo_sapiens/91_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz]
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1082 --filter-vcf A VCF for FILTER tag common_variant. Set to 0 to disable [~/.vep/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz]
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1083 --max-filter-ac Use tag common_variant if the filter-vcf reports a subpopulation AC higher than this [10]
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1084 --species Ensembl-friendly name of species (e.g. mus_musculus for mouse) [homo_sapiens]
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1085 --ncbi-build NCBI reference assembly of variants MAF (e.g. GRCm38 for mouse) [GRCh37]
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1086 --cache-version Version of offline cache to use with VEP (e.g. 75, 84, 91) [Default: Installed version]
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1087 --maf-center Variant calling center to report in MAF [.]
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1088 --retain-info Comma-delimited names of INFO fields to retain as extra columns in MAF []
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1089 --min-hom-vaf If GT undefined in VCF, minimum allele fraction to call a variant homozygous [0.7]
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1090 --remap-chain Chain file to remap variants to a different assembly before running VEP
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1091 --help Print a brief help message and quit
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1092 --man Print the detailed manual
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1093
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1094 =head1 DESCRIPTION
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1095
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1096 To convert a VCF into a MAF, each variant must be mapped to only one of all possible gene transcripts/isoforms that it might affect. This selection of a single effect per variant, is often subjective. So this project is an attempt to make the selection criteria smarter, reproducible, and more configurable.
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1097
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1098 This script needs VEP, a variant annotator that maps effects of a variant on all possible genes and transcripts. For more info, see the README.
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1099
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1100 =head2 Relevant links:
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1101
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1102 Homepage: https://github.com/ckandoth/vcf2maf
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1103 VCF format: http://samtools.github.io/hts-specs/
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1104 MAF format: https://wiki.nci.nih.gov/x/eJaPAQ
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1105 VEP: http://ensembl.org/info/docs/tools/vep/index.html
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1106 VEP annotated VCF format: http://ensembl.org/info/docs/tools/vep/vep_formats.html#vcfout
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1107
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1108 =head1 AUTHORS
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1109
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1110 Cyriac Kandoth (ckandoth@gmail.com)
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1111 Shweta Chavan (chavan.shweta@gmail.com)
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1112
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1113 =head1 LICENSE
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1114
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1115 Apache-2.0 | Apache License, Version 2.0 | https://www.apache.org/licenses/LICENSE-2.0
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1116
786c9295d2be Uploaded
elixir-it
parents:
diff changeset
1117 =cut