|
0
|
1 #!/usr/bin/env perl
|
|
|
2
|
|
|
3 # vcf2maf - Convert a VCF into a MAF by mapping each variant to only one of all possible gene isoforms
|
|
|
4
|
|
|
5 use strict;
|
|
|
6 use warnings;
|
|
|
7 use IO::File;
|
|
|
8 use Getopt::Long qw( GetOptions );
|
|
|
9 use Pod::Usage qw( pod2usage );
|
|
|
10 use File::Copy qw( move );
|
|
|
11 use File::Path qw( mkpath );
|
|
|
12 use Config;
|
|
|
13
|
|
|
14 # Set any default paths and constants
|
|
|
15 my ( $tumor_id, $normal_id ) = ( "TUMOR", "NORMAL" );
|
|
|
16 my ( $vep_path, $vep_data, $vep_forks, $buffer_size, $any_allele ) = ( "$ENV{HOME}/vep", "$ENV{HOME}/.vep", 4, 5000, 0 );
|
|
|
17 my ( $ref_fasta, $filter_vcf ) = ( "$ENV{HOME}/.vep/homo_sapiens/91_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz", "$ENV{HOME}/.vep/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz" );
|
|
|
18 my ( $species, $ncbi_build, $cache_version, $maf_center, $retain_info, $min_hom_vaf, $max_filter_ac ) = ( "homo_sapiens", "GRCh37", "", ".", "", 0.7, 10 );
|
|
|
19 my $perl_bin = $Config{perlpath};
|
|
|
20
|
|
|
21 # Find out if samtools and tabix are properly installed, and warn the user if it's not
|
|
|
22 my ( $samtools ) = map{chomp; $_}`which samtools`;
|
|
|
23 ( $samtools and -e $samtools ) or die "ERROR: Please install samtools, and make sure it's in your PATH\n";
|
|
|
24 my ( $tabix ) = map{chomp; $_}`which tabix`;
|
|
|
25 ( $tabix and -e $tabix ) or die "ERROR: Please install tabix, and make sure it's in your PATH\n";
|
|
|
26
|
|
|
27 # Hash to convert 3-letter amino-acid codes to their 1-letter codes
|
|
|
28 my %aa3to1 = qw( Ala A Arg R Asn N Asp D Asx B Cys C Glu E Gln Q Glx Z Gly G His H Ile I Leu L
|
|
|
29 Lys K Met M Phe F Pro P Ser S Thr T Trp W Tyr Y Val V Xxx X Ter * );
|
|
|
30
|
|
|
31 # Prioritize Sequence Ontology terms in order of severity, as estimated by Ensembl:
|
|
|
32 # http://useast.ensembl.org/info/genome/variation/predicted_data.html#consequences
|
|
|
33 sub GetEffectPriority {
|
|
|
34 my ( $effect ) = @_;
|
|
|
35 $effect = '' unless( defined $effect );
|
|
|
36 my %effectPriority = (
|
|
|
37 'transcript_ablation' => 1, # A feature ablation whereby the deleted region includes a transcript feature
|
|
|
38 'exon_loss_variant' => 1, # A sequence variant whereby an exon is lost from the transcript
|
|
|
39 'splice_donor_variant' => 2, # A splice variant that changes the 2 base region at the 5' end of an intron
|
|
|
40 'splice_acceptor_variant' => 2, # A splice variant that changes the 2 base region at the 3' end of an intron
|
|
|
41 'stop_gained' => 3, # A sequence variant whereby at least one base of a codon is changed, resulting in a premature stop codon, leading to a shortened transcript
|
|
|
42 'frameshift_variant' => 3, # A sequence variant which causes a disruption of the translational reading frame, because the number of nucleotides inserted or deleted is not a multiple of three
|
|
|
43 'stop_lost' => 3, # A sequence variant where at least one base of the terminator codon (stop) is changed, resulting in an elongated transcript
|
|
|
44 'start_lost' => 4, # A codon variant that changes at least one base of the canonical start codon
|
|
|
45 'initiator_codon_variant' => 4, # A codon variant that changes at least one base of the first codon of a transcript
|
|
|
46 'disruptive_inframe_insertion' => 5, # An inframe increase in cds length that inserts one or more codons into the coding sequence within an existing codon
|
|
|
47 'disruptive_inframe_deletion' => 5, # An inframe decrease in cds length that deletes bases from the coding sequence starting within an existing codon
|
|
|
48 'inframe_insertion' => 5, # An inframe non synonymous variant that inserts bases into the coding sequence
|
|
|
49 'inframe_deletion' => 5, # An inframe non synonymous variant that deletes bases from the coding sequence
|
|
|
50 'protein_altering_variant' => 5, # A sequence variant which is predicted to change the protein encoded in the coding sequence
|
|
|
51 'missense_variant' => 6, # A sequence variant, that changes one or more bases, resulting in a different amino acid sequence but where the length is preserved
|
|
|
52 'conservative_missense_variant' => 6, # A sequence variant whereby at least one base of a codon is changed resulting in a codon that encodes for a different but similar amino acid. These variants may or may not be deleterious
|
|
|
53 'rare_amino_acid_variant' => 6, # A sequence variant whereby at least one base of a codon encoding a rare amino acid is changed, resulting in a different encoded amino acid
|
|
|
54 'transcript_amplification' => 7, # A feature amplification of a region containing a transcript
|
|
|
55 'splice_region_variant' => 8, # A sequence variant in which a change has occurred within the region of the splice site, either within 1-3 bases of the exon or 3-8 bases of the intron
|
|
|
56 'stop_retained_variant' => 9, # A sequence variant where at least one base in the terminator codon is changed, but the terminator remains
|
|
|
57 'synonymous_variant' => 9, # A sequence variant where there is no resulting change to the encoded amino acid
|
|
|
58 'incomplete_terminal_codon_variant' => 10, # A sequence variant where at least one base of the final codon of an incompletely annotated transcript is changed
|
|
|
59 'coding_sequence_variant' => 11, # A sequence variant that changes the coding sequence
|
|
|
60 'mature_miRNA_variant' => 11, # A transcript variant located with the sequence of the mature miRNA
|
|
|
61 'exon_variant' => 11, # A sequence variant that changes exon sequence
|
|
|
62 '5_prime_UTR_variant' => 12, # A UTR variant of the 5' UTR
|
|
|
63 '5_prime_UTR_premature_start_codon_gain_variant' => 12, # snpEff-specific effect, creating a start codon in 5' UTR
|
|
|
64 '3_prime_UTR_variant' => 12, # A UTR variant of the 3' UTR
|
|
|
65 'non_coding_exon_variant' => 13, # A sequence variant that changes non-coding exon sequence
|
|
|
66 'non_coding_transcript_exon_variant' => 13, # snpEff-specific synonym for non_coding_exon_variant
|
|
|
67 'non_coding_transcript_variant' => 14, # A transcript variant of a non coding RNA gene
|
|
|
68 'nc_transcript_variant' => 14, # A transcript variant of a non coding RNA gene (older alias for non_coding_transcript_variant)
|
|
|
69 'intron_variant' => 14, # A transcript variant occurring within an intron
|
|
|
70 'intragenic_variant' => 14, # A variant that occurs within a gene but falls outside of all transcript features. This occurs when alternate transcripts of a gene do not share overlapping sequence
|
|
|
71 'INTRAGENIC' => 14, # snpEff-specific synonym of intragenic_variant
|
|
|
72 'NMD_transcript_variant' => 15, # A variant in a transcript that is the target of NMD
|
|
|
73 'upstream_gene_variant' => 16, # A sequence variant located 5' of a gene
|
|
|
74 'downstream_gene_variant' => 16, # A sequence variant located 3' of a gene
|
|
|
75 'TFBS_ablation' => 17, # A feature ablation whereby the deleted region includes a transcription factor binding site
|
|
|
76 'TFBS_amplification' => 17, # A feature amplification of a region containing a transcription factor binding site
|
|
|
77 'TF_binding_site_variant' => 17, # A sequence variant located within a transcription factor binding site
|
|
|
78 'regulatory_region_ablation' => 17, # A feature ablation whereby the deleted region includes a regulatory region
|
|
|
79 'regulatory_region_amplification' => 17, # A feature amplification of a region containing a regulatory region
|
|
|
80 'regulatory_region_variant' => 17, # A sequence variant located within a regulatory region
|
|
|
81 'regulatory_region' =>17, # snpEff-specific effect that should really be regulatory_region_variant
|
|
|
82 'feature_elongation' => 18, # A sequence variant that causes the extension of a genomic feature, with regard to the reference sequence
|
|
|
83 'feature_truncation' => 18, # A sequence variant that causes the reduction of a genomic feature, with regard to the reference sequence
|
|
|
84 'intergenic_variant' => 19, # A sequence variant located in the intergenic region, between genes
|
|
|
85 'intergenic_region' => 19, # snpEff-specific effect that should really be intergenic_variant
|
|
|
86 '' => 20
|
|
|
87 );
|
|
|
88 unless( defined $effectPriority{$effect} ) {
|
|
|
89 warn "WARNING: Unrecognized effect \"$effect\". Assigning lowest priority!\n";
|
|
|
90 return 20;
|
|
|
91 }
|
|
|
92 return $effectPriority{$effect};
|
|
|
93 }
|
|
|
94
|
|
|
95 # Prioritize the transcript biotypes that variants are annotated to, based on disease significance:
|
|
|
96 # All possible biotypes are defined here: http://www.gencodegenes.org/gencode_biotypes.html
|
|
|
97 sub GetBiotypePriority {
|
|
|
98 my ( $biotype ) = @_;
|
|
|
99 $biotype = '' unless( defined $biotype );
|
|
|
100 my %biotype_priority = (
|
|
|
101 'protein_coding' => 1, # Contains an open reading frame (ORF)
|
|
|
102 'LRG_gene' => 2, # Gene in a "Locus Reference Genomic" region known to have disease-related sequence variations
|
|
|
103 'IG_C_gene' => 2, # Immunoglobulin (Ig) variable chain genes imported or annotated according to the IMGT
|
|
|
104 'IG_D_gene' => 2, # Immunoglobulin (Ig) variable chain genes imported or annotated according to the IMGT
|
|
|
105 'IG_J_gene' => 2, # Immunoglobulin (Ig) variable chain genes imported or annotated according to the IMGT
|
|
|
106 'IG_LV_gene' => 2, # Immunoglobulin (Ig) variable chain genes imported or annotated according to the IMGT
|
|
|
107 'IG_V_gene' => 2, # Immunoglobulin (Ig) variable chain genes imported or annotated according to the IMGT
|
|
|
108 'TR_C_gene' => 2, # T-cell receptor (TcR) genes imported or annotated according to the IMGT
|
|
|
109 'TR_D_gene' => 2, # T-cell receptor (TcR) genes imported or annotated according to the IMGT
|
|
|
110 'TR_J_gene' => 2, # T-cell receptor (TcR) genes imported or annotated according to the IMGT
|
|
|
111 'TR_V_gene' => 2, # T-cell receptor (TcR) genes imported or annotated according to the IMGT
|
|
|
112 'miRNA' => 3, # Non-coding RNA predicted using sequences from RFAM and miRBase
|
|
|
113 'snRNA' => 3, # Non-coding RNA predicted using sequences from RFAM and miRBase
|
|
|
114 'snoRNA' => 3, # Non-coding RNA predicted using sequences from RFAM and miRBase
|
|
|
115 'ribozyme' => 3, # Non-coding RNA predicted using sequences from RFAM and miRBase
|
|
|
116 'tRNA' => 3, #Added by Y. Boursin
|
|
|
117 'sRNA' => 3, # Non-coding RNA predicted using sequences from RFAM and miRBase
|
|
|
118 'scaRNA' => 3, # Non-coding RNA predicted using sequences from RFAM and miRBase
|
|
|
119 'rRNA' => 3, # Non-coding RNA predicted using sequences from RFAM and miRBase
|
|
|
120 'lincRNA' => 3, # Long, intervening noncoding (linc) RNAs, that can be found in evolutionarily conserved, intergenic regions
|
|
|
121 'bidirectional_promoter_lncrna' => 3, # A non-coding locus that originates from within the promoter region of a protein-coding gene, with transcription proceeding in the opposite direction on the other strand
|
|
|
122 'bidirectional_promoter_lncRNA' => 3, # A non-coding locus that originates from within the promoter region of a protein-coding gene, with transcription proceeding in the opposite direction on the other strand
|
|
|
123 'known_ncrna' => 4,
|
|
|
124 'vaultRNA' => 4, # Short non coding RNA genes that form part of the vault ribonucleoprotein complex
|
|
|
125 'macro_lncRNA' => 4, # unspliced lncRNAs that are several kb in size
|
|
|
126 'Mt_tRNA' => 4, # Non-coding RNA predicted using sequences from RFAM and miRBase
|
|
|
127 'Mt_rRNA' => 4, # Non-coding RNA predicted using sequences from RFAM and miRBase
|
|
|
128 'antisense' => 5, # Has transcripts that overlap the genomic span (i.e. exon or introns) of a protein-coding locus on the opposite strand
|
|
|
129 'antisense_RNA' => 5, # Alias for antisense (Y. Boursin)
|
|
|
130 'sense_intronic' => 5, # Long non-coding transcript in introns of a coding gene that does not overlap any exons
|
|
|
131 'sense_overlapping' => 5, # Long non-coding transcript that contains a coding gene in its intron on the same strand
|
|
|
132 '3prime_overlapping_ncrna' => 5, # Transcripts where ditag and/or published experimental data strongly supports the existence of short non-coding transcripts transcribed from the 3'UTR
|
|
|
133 '3prime_overlapping_ncRNA' => 5, # Transcripts where ditag and/or published experimental data strongly supports the existence of short non-coding transcripts transcribed from the 3'UTR
|
|
|
134 'misc_RNA' => 5, # Non-coding RNA predicted using sequences from RFAM and miRBase
|
|
|
135 'non_coding' => 5, # Transcript which is known from the literature to not be protein coding
|
|
|
136 'regulatory_region' => 6, # A region of sequence that is involved in the control of a biological process
|
|
|
137 'disrupted_domain' => 6, # Otherwise viable coding region omitted from this alternatively spliced transcript because the splice variation affects a region coding for a protein domain
|
|
|
138 'processed_transcript' => 6, # Doesn't contain an ORF
|
|
|
139 'TEC' => 6, # To be Experimentally Confirmed. This is used for non-spliced EST clusters that have polyA features. This category has been specifically created for the ENCODE project to highlight regions that could indicate the presence of protein coding genes that require experimental validation, either by 5' RACE or RT-PCR to extend the transcripts, or by confirming expression of the putatively-encoded peptide with specific antibodies
|
|
|
140 'TF_binding_site' => 7, # A region of a nucleotide molecule that binds a Transcription Factor or Transcription Factor complex
|
|
|
141 'CTCF_binding_site' =>7, # A transcription factor binding site with consensus sequence CCGCGNGGNGGCAG, bound by CCCTF-binding factor
|
|
|
142 'promoter_flanking_region' => 7, # A region immediately adjacent to a promoter which may or may not contain transcription factor binding sites
|
|
|
143 'enhancer' => 7, # A cis-acting sequence that increases the utilization of (some) eukaryotic promoters, and can function in either orientation and in any location (upstream or downstream) relative to the promoter
|
|
|
144 'promoter' => 7, # A regulatory_region composed of the TSS(s) and binding sites for TF_complexes of the basal transcription machinery
|
|
|
145 'open_chromatin_region' => 7, # A DNA sequence that in the normal state of the chromosome corresponds to an unfolded, un-complexed stretch of double-stranded DNA
|
|
|
146 'retained_intron' => 7, # Alternatively spliced transcript believed to contain intronic sequence relative to other, coding, variants
|
|
|
147 'nonsense_mediated_decay' => 7, # If the coding sequence (following the appropriate reference) of a transcript finishes >50bp from a downstream splice site then it is tagged as NMD. If the variant does not cover the full reference coding sequence then it is annotated as NMD if NMD is unavoidable i.e. no matter what the exon structure of the missing portion is the transcript will be subject to NMD
|
|
|
148 'non_stop_decay' => 7, # Transcripts that have polyA features (including signal) without a prior stop codon in the CDS, i.e. a non-genomic polyA tail attached directly to the CDS without 3' UTR. These transcripts are subject to degradation
|
|
|
149 'ambiguous_orf' => 7, # Transcript believed to be protein coding, but with more than one possible open reading frame
|
|
|
150 'pseudogene' => 8, # Have homology to proteins but generally suffer from a disrupted coding sequence and an active homologous gene can be found at another locus. Sometimes these entries have an intact coding sequence or an open but truncated ORF, in which case there is other evidence used (for example genomic polyA stretches at the 3' end) to classify them as a pseudogene. Can be further classified as one of the following
|
|
|
151 'processed_pseudogene' => 8, # Pseudogene that lack introns and is thought to arise from reverse transcription of mRNA followed by reinsertion of DNA into the genome
|
|
|
152 'polymorphic_pseudogene' => 8, # Pseudogene owing to a SNP/DIP but in other individuals/haplotypes/strains the gene is translated
|
|
|
153 'retrotransposed' => 8, # Pseudogene owing to a reverse transcribed and re-inserted sequence
|
|
|
154 'translated_processed_pseudogene' => 8, # Pseudogenes that have mass spec data suggesting that they are also translated
|
|
|
155 'translated_unprocessed_pseudogene' => 8, # Pseudogenes that have mass spec data suggesting that they are also translated
|
|
|
156 'transcribed_processed_pseudogene' => 8, # Pseudogene where protein homology or genomic structure indicates a pseudogene, but the presence of locus-specific transcripts indicates expression
|
|
|
157 'transcribed_unprocessed_pseudogene' => 8, # Pseudogene where protein homology or genomic structure indicates a pseudogene, but the presence of locus-specific transcripts indicates expression
|
|
|
158 'transcribed_unitary_pseudogene' => 8, #Pseudogene where protein homology or genomic structure indicates a pseudogene, but the presence of locus-specific transcripts indicates expression
|
|
|
159 'unitary_pseudogene' => 8, # A species specific unprocessed pseudogene without a parent gene, as it has an active orthologue in another species
|
|
|
160 'unprocessed_pseudogene' => 8, # Pseudogene that can contain introns since produced by gene duplication
|
|
|
161 'Mt_tRNA_pseudogene' => 8, # Non-coding RNAs predicted to be pseudogenes by the Ensembl pipeline
|
|
|
162 'tRNA_pseudogene' => 8, # Non-coding RNAs predicted to be pseudogenes by the Ensembl pipeline
|
|
|
163 'snoRNA_pseudogene' => 8, # Non-coding RNAs predicted to be pseudogenes by the Ensembl pipeline
|
|
|
164 'snRNA_pseudogene' => 8, # Non-coding RNAs predicted to be pseudogenes by the Ensembl pipeline
|
|
|
165 'scRNA_pseudogene' => 8, # Non-coding RNAs predicted to be pseudogenes by the Ensembl pipeline
|
|
|
166 'rRNA_pseudogene' => 8, # Non-coding RNAs predicted to be pseudogenes by the Ensembl pipeline
|
|
|
167 'misc_RNA_pseudogene' => 8, # Non-coding RNAs predicted to be pseudogenes by the Ensembl pipeline
|
|
|
168 'miRNA_pseudogene' => 8, # Non-coding RNAs predicted to be pseudogenes by the Ensembl pipeline
|
|
|
169 'IG_C_pseudogene' => 8, # Inactivated immunoglobulin gene
|
|
|
170 'IG_D_pseudogene' => 8, # Inactivated immunoglobulin gene
|
|
|
171 'IG_J_pseudogene' => 8, # Inactivated immunoglobulin gene
|
|
|
172 'IG_V_pseudogene' => 8, # Inactivated immunoglobulin gene
|
|
|
173 'TR_J_pseudogene' => 8, # Inactivated immunoglobulin gene
|
|
|
174 'TR_V_pseudogene' => 8, # Inactivated immunoglobulin gene
|
|
|
175 'artifact' => 9, # Used to tag mistakes in the public databases (Ensembl/SwissProt/Trembl)
|
|
|
176 '' => 10
|
|
|
177 );
|
|
|
178 unless( defined $biotype_priority{$biotype} ) {
|
|
|
179 warn "WARNING: Unrecognized biotype \"$biotype\". Assigning lowest priority!\n";
|
|
|
180 return 10;
|
|
|
181 }
|
|
|
182 return $biotype_priority{$biotype};
|
|
|
183 }
|
|
|
184
|
|
|
185 # Check for missing or crappy arguments
|
|
|
186 unless( @ARGV and $ARGV[0] =~ m/^-/ ) {
|
|
|
187 pod2usage( -verbose => 0, -message => "$0: Missing or invalid arguments!\n", -exitval => 2 );
|
|
|
188 }
|
|
|
189
|
|
|
190 # Parse options and print usage if there is a syntax error, or if usage was explicitly requested
|
|
|
191 my ( $man, $help ) = ( 0, 0 );
|
|
|
192 my ( $input_vcf, $output_maf, $tmp_dir, $custom_enst_file );
|
|
|
193 my ( $vcf_tumor_id, $vcf_normal_id, $remap_chain );
|
|
|
194 GetOptions(
|
|
|
195 'help!' => \$help,
|
|
|
196 'man!' => \$man,
|
|
|
197 'input-vcf=s' => \$input_vcf,
|
|
|
198 'output-maf=s' => \$output_maf,
|
|
|
199 'tmp-dir=s' => \$tmp_dir,
|
|
|
200 'tumor-id=s' => \$tumor_id,
|
|
|
201 'normal-id=s' => \$normal_id,
|
|
|
202 'vcf-tumor-id=s' => \$vcf_tumor_id,
|
|
|
203 'vcf-normal-id=s' => \$vcf_normal_id,
|
|
|
204 'custom-enst=s' => \$custom_enst_file,
|
|
|
205 'vep-path=s' => \$vep_path,
|
|
|
206 'vep-data=s' => \$vep_data,
|
|
|
207 'vep-forks=s' => \$vep_forks,
|
|
|
208 'buffer-size=i' => \$buffer_size,
|
|
|
209 'any-allele!' => \$any_allele,
|
|
|
210 'ref-fasta=s' => \$ref_fasta,
|
|
|
211 'species=s' => \$species,
|
|
|
212 'ncbi-build=s' => \$ncbi_build,
|
|
|
213 'cache-version=s' => \$cache_version,
|
|
|
214 'maf-center=s' => \$maf_center,
|
|
|
215 'retain-info=s' => \$retain_info,
|
|
|
216 'min-hom-vaf=s' => \$min_hom_vaf,
|
|
|
217 'remap-chain=s' => \$remap_chain,
|
|
|
218 'filter-vcf=s' => \$filter_vcf,
|
|
|
219 'max-filter-ac=i' => \$max_filter_ac
|
|
|
220 ) or pod2usage( -verbose => 1, -input => \*DATA, -exitval => 2 );
|
|
|
221 pod2usage( -verbose => 1, -input => \*DATA, -exitval => 0 ) if( $help );
|
|
|
222 pod2usage( -verbose => 2, -input => \*DATA, -exitval => 0 ) if( $man );
|
|
|
223
|
|
|
224 # Check if required arguments are missing or problematic
|
|
|
225 ( defined $input_vcf and defined $output_maf ) or die "ERROR: Both input-vcf and output-maf must be defined!\n";
|
|
|
226 ( -s $input_vcf ) or die "ERROR: Provided --input-vcf is missing or empty: $input_vcf\n";
|
|
|
227 ( -s $ref_fasta ) or die "ERROR: Provided --ref-fasta is missing or empty: $ref_fasta\n";
|
|
|
228 ( $input_vcf !~ m/\.(gz|bz2|bcf)$/ ) or die "ERROR: Unfortunately, --input-vcf cannot be in a compressed format\n";
|
|
|
229
|
|
|
230 # Unless specified, assume that the VCF uses the same sample IDs that the MAF will contain
|
|
|
231 $vcf_tumor_id = $tumor_id unless( $vcf_tumor_id );
|
|
|
232 $vcf_normal_id = $normal_id unless( $vcf_normal_id );
|
|
|
233
|
|
|
234 # Load up the custom isoform overrides if provided:
|
|
|
235 my %custom_enst;
|
|
|
236 if( $custom_enst_file ) {
|
|
|
237 ( -s $custom_enst_file ) or die "ERROR: Provided --custom-enst file is missing or empty: $custom_enst_file\n";
|
|
|
238 %custom_enst = map{chomp; ( $_, 1 )}`grep -v ^# $custom_enst_file | cut -f1`;
|
|
|
239 }
|
|
|
240
|
|
|
241 # Create a folder for the intermediate VCFs if user-defined, or default to the input VCF's folder
|
|
|
242 if( defined $tmp_dir ) {
|
|
|
243 mkpath( $tmp_dir ) unless( -d $tmp_dir );
|
|
|
244 }
|
|
|
245 else {
|
|
|
246 $tmp_dir = substr( $input_vcf, 0, rindex( $input_vcf, "/" )) if( $input_vcf =~ m/\// );
|
|
|
247 $tmp_dir = "." unless( $tmp_dir ); # In case the input VCF is in the current working directory
|
|
|
248 }
|
|
|
249
|
|
|
250 # Also figure out the base name of the input VCF, cuz we'll be naming a lot of files based on that
|
|
|
251 my $input_name = substr( $input_vcf, rindex( $input_vcf, "/" ) + 1 );
|
|
|
252 $input_name =~ s/(\.vcf)*$//;
|
|
|
253
|
|
|
254 # If the VCF contains SVs, split the breakpoints into separate lines before passing to VEP
|
|
|
255 my $split_svs = 0;
|
|
|
256 my $orig_vcf_fh = IO::File->new( $input_vcf ) or die "ERROR: Couldn't open --input-vcf: $input_vcf!\n";
|
|
|
257 my $split_vcf_fh = IO::File->new( "$tmp_dir/$input_name.split.vcf", "w" ) or die "ERROR: Couldn't open VCF: $tmp_dir/$input_name.split.vcf!\n";
|
|
|
258 while( my $line = $orig_vcf_fh->getline ) {
|
|
|
259 # If the file uses Mac OS 9 newlines, quit with an error
|
|
|
260 ( $line !~ m/\r$/ ) or die "ERROR: Your VCF uses CR line breaks, which we can't support. Please use LF or CRLF.\n";
|
|
|
261
|
|
|
262 if( $line =~ m/^#/ ) {
|
|
|
263 $split_vcf_fh->print( $line ); # Write header lines unchanged
|
|
|
264 next;
|
|
|
265 }
|
|
|
266
|
|
|
267 chomp( $line );
|
|
|
268 my @cols = split( "\t", $line );
|
|
|
269 my %info = map {( m/=/ ? ( split( /=/, $_, 2 )) : ( $_, "1" ))} split( /\;/, $cols[7] );
|
|
|
270 if( $info{SVTYPE} ){
|
|
|
271 # Remove SVTYPE tag if REF/ALT alleles are defined, or VEP won't report transcript effects
|
|
|
272 if( $cols[3]=~m/^[ACGTN]+$/i and $cols[4]=~m/^[ACGTN,]+$/i ) {
|
|
|
273 $cols[7]=~s/(SVTYPE=\w+;|;SVTYPE=\w+|SVTYPE=\w+)//;
|
|
|
274 $split_vcf_fh->print( join( "\t", @cols ), "\n" );
|
|
|
275 }
|
|
|
276 # For legit SVs except insertions, split them into two separate breakpoint events
|
|
|
277 elsif( $info{SVTYPE}=~m/^(BND|TRA|DEL|DUP|INV)$/ ) {
|
|
|
278 $split_svs = 1;
|
|
|
279 # Don't tell VEP it's an SV, by removing the SVTYPE tag
|
|
|
280 $cols[7]=~s/(SVTYPE=\w+;|;SVTYPE=\w+|SVTYPE=\w+)//;
|
|
|
281 # Rename two SV specific INFO keys to something friendlier
|
|
|
282 $cols[7]=~s/CT=([35]to[35])/Frame=$1/;
|
|
|
283 $cols[7]=~s/SVMETHOD=([\w.]+)/Method=$1/;
|
|
|
284 $cols[4] = "<" . $info{SVTYPE} . ">";
|
|
|
285 # Fetch the REF allele at the second breakpoint using samtools faidx
|
|
|
286 my $ref2 = `$samtools faidx $ref_fasta $info{CHR2}:$info{END}-$info{END} | grep -v ^\\>`;
|
|
|
287 chomp( $ref2 );
|
|
|
288 $split_vcf_fh->print( join( "\t", $info{CHR2}, $info{END}, $cols[2], ( $ref2 ? $ref2 : $cols[3] ), @cols[4..$#cols] ), "\n" );
|
|
|
289 $split_vcf_fh->print( join( "\t", @cols ), "\n" );
|
|
|
290 }
|
|
|
291 $input_vcf = "$tmp_dir/$input_name.split.vcf";
|
|
|
292 }
|
|
|
293 else {
|
|
|
294 $split_vcf_fh->print( join( "\t", @cols ), "\n" );
|
|
|
295 }
|
|
|
296 }
|
|
|
297 $split_vcf_fh->close;
|
|
|
298 $orig_vcf_fh->close;
|
|
|
299
|
|
|
300 # Delete the split.vcf created above if we didn't find any variants with the SVTYPE tag
|
|
|
301 unlink( "$tmp_dir/$input_name.split.vcf" ) if( $input_vcf ne "$tmp_dir/$input_name.split.vcf" );
|
|
|
302
|
|
|
303 # If a liftOver chain was provided, remap and switch the input VCF before annotation
|
|
|
304 my ( %remap );
|
|
|
305 if( $remap_chain ) {
|
|
|
306 # Find out if liftOver is properly installed, and warn the user if it's not
|
|
|
307 my $liftover = `which liftOver`;
|
|
|
308 chomp( $liftover );
|
|
|
309 ( $liftover and -e $liftover ) or die "ERROR: Please install liftOver, and make sure it's in your PATH\n";
|
|
|
310
|
|
|
311 # Make a BED file from the VCF, run liftOver on it, and create a hash mapping old to new loci
|
|
|
312 `grep -v ^# $input_vcf | cut -f1,2 | awk '{OFS="\\t"; print \$1,\$2-1,\$2,\$1":"\$2}' > $tmp_dir/$input_name.bed`;
|
|
|
313 %remap = map{chomp; my @c=split("\t"); ($c[3], "$c[0]:$c[2]")}`$liftover $tmp_dir/$input_name.bed $remap_chain /dev/stdout /dev/null 2> /dev/null`;
|
|
|
314 unlink( "$tmp_dir/$input_name.bed" );
|
|
|
315
|
|
|
316 # Create a new VCF in the temp folder, with remapped loci on which we'll run annotation
|
|
|
317 my $orig_vcf_fh = IO::File->new( $input_vcf ) or die "ERROR: Couldn't open --input-vcf: $input_vcf!\n";
|
|
|
318 my $remap_vcf_fh = IO::File->new( "$tmp_dir/$input_name.remap.vcf", "w" ) or die "ERROR: Couldn't open VCF: $tmp_dir/$input_name.remap.vcf!\n";
|
|
|
319 while( my $line = $orig_vcf_fh->getline ) {
|
|
|
320 if( $line =~ m/^#/ ) {
|
|
|
321 $remap_vcf_fh->print( $line ); # Write header lines unchanged
|
|
|
322 }
|
|
|
323 else {
|
|
|
324 chomp( $line );
|
|
|
325 my @cols = split( "\t", $line );
|
|
|
326 my $locus = $cols[0] . ":" . $cols[1];
|
|
|
327 if( defined $remap{$locus} ) {
|
|
|
328 # Retain original variant under INFO, so we can append it later to the output MAF
|
|
|
329 $cols[7] = ( !$cols[7] or $cols[7] eq "." ? "" : "$cols[7];" ) . "REMAPPED_POS=" . join( ":", @cols[0,1,3,4] );
|
|
|
330 @cols[0,1] = split( ":", $remap{$locus} );
|
|
|
331 $remap_vcf_fh->print( join( "\t", @cols ), "\n" );
|
|
|
332 }
|
|
|
333 else {
|
|
|
334 warn "WARNING: Skipping variant at $locus; Unable to liftOver using $remap_chain\n";
|
|
|
335 }
|
|
|
336 }
|
|
|
337 }
|
|
|
338 $remap_vcf_fh->close;
|
|
|
339 $orig_vcf_fh->close;
|
|
|
340 $input_vcf = "$tmp_dir/$input_name.remap.vcf";
|
|
|
341 }
|
|
|
342
|
|
|
343 # Before running annotation, let's pull flanking reference bps for each variant to do some checks,
|
|
|
344 # and we'll also pull out overlapping calls from the filter VCF
|
|
|
345 my $vcf_fh = IO::File->new( $input_vcf ) or die "ERROR: Couldn't open --input-vcf: $input_vcf!\n";
|
|
|
346 my ( %ref_bps, @ref_regions, %uniq_loci, %uniq_regions, %flanking_bps, %filter_data );
|
|
|
347 while( my $line = $vcf_fh->getline ) {
|
|
|
348 # Skip header lines, and pull variant loci to pass to samtools later
|
|
|
349 next if( $line =~ m/^#/ );
|
|
|
350 chomp( $line );
|
|
|
351 my ( $chr, $pos, undef, $ref ) = split( "\t", $line );
|
|
|
352 # Create a region that spans the length of the reference allele and 1bp flanks around it
|
|
|
353 my $region = "$chr:" . ( $pos - 1 ) . "-" . ( $pos + length( $ref ));
|
|
|
354 $ref_bps{$region} = $ref;
|
|
|
355 push( @ref_regions, $region );
|
|
|
356 $uniq_regions{$region} = 1;
|
|
|
357 $uniq_loci{"$chr:$pos-$pos"} = 1;
|
|
|
358 }
|
|
|
359 $vcf_fh->close;
|
|
|
360
|
|
|
361 # samtools runs faster when passed many loci at a time, but limited to around 125k args, at least
|
|
|
362 # on CentOS 6. If there are too many loci, split them into 50k chunks and run separately
|
|
|
363 my ( $lines, @regions_split ) = ( "", ());
|
|
|
364 my @regions = keys %uniq_regions;
|
|
|
365 my $chr_prefix_in_use = ( @regions and $regions[0] =~ m/^chr/ ? 1 : 0 );
|
|
|
366 push( @regions_split, [ splice( @regions, 0, 50000 ) ] ) while @regions;
|
|
|
367 map{ my $region = join( " ", @{$_} ); $lines .= `$samtools faidx $ref_fasta $region` } @regions_split;
|
|
|
368 foreach my $line ( grep( length, split( ">", $lines ))) {
|
|
|
369 # Carefully split this FASTA entry, properly chomping newlines for long indels
|
|
|
370 my ( $region, $bps ) = split( "\n", $line, 2 );
|
|
|
371 $bps =~ s/\r|\n//g;
|
|
|
372 if( $bps ){
|
|
|
373 $bps = uc( $bps );
|
|
|
374 $flanking_bps{$region} = $bps;
|
|
|
375 }
|
|
|
376 }
|
|
|
377
|
|
|
378 # If flanking_bps is entirely empty, then it's most likely that the user chose the wrong ref-fasta
|
|
|
379 # Or it's also possible that an outdated samtools was unable to parse the gzipped FASTA files
|
|
|
380 # ::NOTE:: If input had no variants, don't break here, so we can continue to create an empty MAF
|
|
|
381 ( !@regions or %flanking_bps ) or die "ERROR: You're either using an outdated samtools, or --ref-fasta is not the same genome build as your --input-vcf.";
|
|
|
382
|
|
|
383 # Skip filtering if not handling GRCh37, and filter-vcf is pointing to the default GRCh37 ExAC VCF
|
|
|
384 if(( $species eq "homo_sapiens" and $ncbi_build eq "GRCh37" and $filter_vcf ) or ( $filter_vcf and $filter_vcf ne "$ENV{HOME}/.vep/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz" )) {
|
|
|
385 ( -s $filter_vcf ) or die "ERROR: Provided --filter-vcf is missing or empty: $filter_vcf\n";
|
|
|
386 # Query each variant locus on the filter VCF, using tabix, just like we used samtools earlier
|
|
|
387 ( $lines, @regions_split ) = ( "", ());
|
|
|
388 my @regions = keys %uniq_loci;
|
|
|
389 push( @regions_split, [ splice( @regions, 0, 50000 ) ] ) while @regions;
|
|
|
390 # ::NOTE:: chr-prefix removal works safely here because ExAC is limited to 1..22, X, Y
|
|
|
391 map{ my $loci = join( " ", map{s/^chr//; $_} @{$_} ); $lines .= `$tabix $filter_vcf $loci` } @regions_split;
|
|
|
392 foreach my $line ( split( "\n", $lines )) {
|
|
|
393 my ( $chr, $pos, undef, $ref, $alt, undef, $filter, $info_line ) = split( "\t", $line );
|
|
|
394 # Parse out data from info column, and store it for later, along with REF, ALT, and FILTER
|
|
|
395 my $locus = ( $chr_prefix_in_use ? "chr$chr:$pos" : "$chr:$pos" );
|
|
|
396 %{$filter_data{$locus}} = map {( m/=/ ? ( split( /=/, $_, 2 )) : ( $_, "1" ))} split( /\;/, $info_line );
|
|
|
397 $filter_data{$locus}{REF} = $ref;
|
|
|
398 $filter_data{$locus}{ALT} = $alt;
|
|
|
399 $filter_data{$locus}{FILTER} = $filter;
|
|
|
400 }
|
|
|
401 }
|
|
|
402
|
|
|
403 # For each variant locus and reference allele in the input VCF, report any problems
|
|
|
404 foreach my $region ( @ref_regions ) {
|
|
|
405 my $ref = $ref_bps{$region};
|
|
|
406 my ( $locus ) = map{ my ( $chr, $pos ) = split( ":" ); ++$pos; "$chr:$pos" } split( "-", $region );
|
|
|
407 if( !defined $flanking_bps{$region} ) {
|
|
|
408 warn "WARNING: Couldn't retrieve bps around $locus from reference FASTA: $ref_fasta\n";
|
|
|
409 }
|
|
|
410 elsif( $flanking_bps{$region} !~ m/^[ACGTN]+$/ ) {
|
|
|
411 warn "WARNING: Retrieved invalid bps " . $flanking_bps{$region} . " around $locus from reference FASTA: $ref_fasta\n";
|
|
|
412 }
|
|
|
413 elsif( $ref ne substr( $flanking_bps{$region}, 1, length( $ref ))) {
|
|
|
414 warn "WARNING: Reference allele $ref at $locus doesn't match " .
|
|
|
415 substr( $flanking_bps{$region}, 1, length( $ref )) . " (flanking bps: " .
|
|
|
416 $flanking_bps{$region} . ") from reference FASTA: $ref_fasta\n";
|
|
|
417 }
|
|
|
418 }
|
|
|
419
|
|
|
420 # Annotate variants in given VCF to all possible transcripts
|
|
|
421 my $output_vcf = ( $remap_chain ? "$tmp_dir/$input_name.remap.vep.vcf" : "$tmp_dir/$input_name.vep.vcf" );
|
|
|
422 # Skip running VEP if an annotated VCF already exists
|
|
|
423 unless( -s $output_vcf ) {
|
|
|
424 warn "STATUS: Running VEP and writing to: $output_vcf\n";
|
|
|
425 # Make sure we can find the VEP script
|
|
|
426 my $vep_script = ( -s "$vep_path/vep" ? "$vep_path/vep" : "$vep_path/variant_effect_predictor.pl" );
|
|
|
427 ( -s $vep_script ) or die "ERROR: Cannot find VEP script in path: $vep_path\n";
|
|
|
428
|
|
|
429 # Contruct VEP command using some default options and run it
|
|
|
430 my $vep_cmd = "$perl_bin $vep_script --species $species --assembly $ncbi_build --offline --no_progress --no_stats --buffer_size $buffer_size --sift b --ccds --uniprot --hgvs --symbol --numbers --domains --gene_phenotype --canonical --protein --biotype --uniprot --tsl --pubmed --variant_class --shift_hgvs 1 --check_existing --total_length --allele_number --no_escape --xref_refseq --failed 1 --vcf --flag_pick_allele --pick_order canonical,tsl,biotype,rank,ccds,length --dir $vep_data --fasta $ref_fasta --format vcf --input_file $input_vcf --output_file $output_vcf";
|
|
|
431 # VEP barks if --fork is set to 1. So don't use this argument unless it's >1
|
|
|
432 $vep_cmd .= " --fork $vep_forks" if( $vep_forks > 1 );
|
|
|
433 # Require allele match for co-located variants unless user-rejected or we're using a newer VEP
|
|
|
434 $vep_cmd .= " --check_allele" unless( $any_allele or $vep_script =~ m/vep$/ );
|
|
|
435 # Add --cache-version only if the user specifically asked for a version
|
|
|
436 $vep_cmd .= " --cache_version $cache_version" if( $cache_version );
|
|
|
437 # Add options that only work on human variants
|
|
|
438 if( $species eq "homo_sapiens" ) {
|
|
|
439 # Slight change in these arguments if using the newer VEP
|
|
|
440 $vep_cmd .= " --polyphen b " . ( $vep_script =~ m/vep$/ ? "--af --af_1kg --af_esp --af_gnomad" : "--gmaf --maf_1kg --maf_esp" );
|
|
|
441 }
|
|
|
442 # Add options that work for most species, except a few we know about
|
|
|
443 $vep_cmd .= " --regulatory" unless( $species eq "canis_familiaris" );
|
|
|
444
|
|
|
445 # Make sure it ran without error codes
|
|
|
446 system( $vep_cmd ) == 0 or die "\nERROR: Failed to run the VEP annotator! Command: $vep_cmd\n";
|
|
|
447 ( -s $output_vcf ) or warn "WARNING: VEP-annotated VCF file is missing or empty: $output_vcf\n";
|
|
|
448 }
|
|
|
449
|
|
|
450 # Define default MAF Header (https://wiki.nci.nih.gov/x/eJaPAQ) with our vcf2maf additions
|
|
|
451 my @maf_header = qw(
|
|
|
452 Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand
|
|
|
453 Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2
|
|
|
454 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode
|
|
|
455 Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2
|
|
|
456 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status
|
|
|
457 Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score
|
|
|
458 BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID HGVSc HGVSp HGVSp_Short Transcript_ID
|
|
|
459 Exon_Number t_depth t_ref_count t_alt_count n_depth n_ref_count n_alt_count all_effects
|
|
|
460 );
|
|
|
461
|
|
|
462 # Add extra annotation columns to the MAF in a consistent order
|
|
|
463 my @ann_cols = qw( Allele Gene Feature Feature_type Consequence cDNA_position CDS_position
|
|
|
464 Protein_position Amino_acids Codons Existing_variation ALLELE_NUM DISTANCE STRAND_VEP SYMBOL
|
|
|
465 SYMBOL_SOURCE HGNC_ID BIOTYPE CANONICAL CCDS ENSP SWISSPROT TREMBL UNIPARC RefSeq SIFT PolyPhen
|
|
|
466 EXON INTRON DOMAINS AF AFR_AF AMR_AF ASN_AF EAS_AF EUR_AF SAS_AF AA_AF EA_AF CLIN_SIG SOMATIC
|
|
|
467 PUBMED MOTIF_NAME MOTIF_POS HIGH_INF_POS MOTIF_SCORE_CHANGE IMPACT PICK VARIANT_CLASS TSL
|
|
|
468 HGVS_OFFSET PHENO MINIMISED ExAC_AF ExAC_AF_AFR ExAC_AF_AMR ExAC_AF_EAS ExAC_AF_FIN ExAC_AF_NFE
|
|
|
469 ExAC_AF_OTH ExAC_AF_SAS GENE_PHENO FILTER flanking_bps variant_id variant_qual ExAC_AF_Adj
|
|
|
470 ExAC_AC_AN_Adj ExAC_AC_AN ExAC_AC_AN_AFR ExAC_AC_AN_AMR ExAC_AC_AN_EAS ExAC_AC_AN_FIN
|
|
|
471 ExAC_AC_AN_NFE ExAC_AC_AN_OTH ExAC_AC_AN_SAS ExAC_FILTER gnomAD_AF gnomAD_AFR_AF gnomAD_AMR_AF
|
|
|
472 gnomAD_ASJ_AF gnomAD_EAS_AF gnomAD_FIN_AF gnomAD_NFE_AF gnomAD_OTH_AF gnomAD_SAS_AF );
|
|
|
473
|
|
|
474 my @ann_cols_format; # To store the actual order of VEP data, that may differ between runs
|
|
|
475 push( @maf_header, @ann_cols );
|
|
|
476
|
|
|
477 # If the user has INFO fields they want to retain, create additional columns for those
|
|
|
478 my @addl_info_cols = ();
|
|
|
479 if( $retain_info or $remap_chain or $split_svs ) {
|
|
|
480 # But let's not overwrite existing columns with the same name
|
|
|
481 my %maf_cols = map{ my $c = lc; ( $c, 1 )} @maf_header;
|
|
|
482 @addl_info_cols = grep{ my $c = lc; !$maf_cols{$c}} split( ",", $retain_info );
|
|
|
483 # If a remap-chain was used, add a column to retain the original chr:pos:ref:alt
|
|
|
484 push( @addl_info_cols, "REMAPPED_POS" ) if( $remap_chain );
|
|
|
485 # If we had to split some SVs earlier, add some columns with some useful info about SVs
|
|
|
486 push( @addl_info_cols, qw( Fusion Method Frame CONSENSUS )) if( $split_svs );
|
|
|
487 push( @maf_header, @addl_info_cols );
|
|
|
488 }
|
|
|
489
|
|
|
490 # Locate and load the file mapping ENSG IDs to Entrez IDs
|
|
|
491 my ( $script_dir ) = $0 =~ m/^(.*)\/vcf2maf/;
|
|
|
492 $script_dir = "." unless( $script_dir );
|
|
|
493
|
|
|
494 my $entrez_id_file = "$script_dir/data/ensg_to_entrez_id_map_ensembl_feb2014.tsv";
|
|
|
495 my %entrez_id_map = ();
|
|
|
496 if( -s $entrez_id_file ) {
|
|
|
497 %entrez_id_map = map{chomp; split("\t")} `grep -hv ^# $entrez_id_file`;
|
|
|
498 }
|
|
|
499
|
|
|
500 # Parse through each variant in the annotated VCF, pull out CSQ/ANN from the INFO column, and choose
|
|
|
501 # one transcript per variant whose annotation will be used in the MAF
|
|
|
502 my $maf_fh = IO::File->new( $output_maf, ">" ) or die "ERROR: Couldn't open --output-maf: $output_maf!\n";
|
|
|
503 $maf_fh->print( "#version 2.4\n" . join( "\t", @maf_header ), "\n" ); # Print MAF header
|
|
|
504 ( -s $output_vcf ) or exit; # Warnings on this were printed earlier, but quit here, only after a blank MAF is created
|
|
|
505 my $annotated_vcf_fh = IO::File->new( $output_vcf ) or die "ERROR: Couldn't open annotated VCF: $output_vcf!\n";
|
|
|
506 my ( $vcf_tumor_idx, $vcf_normal_idx, %sv_pair );
|
|
|
507 while( my $line = $annotated_vcf_fh->getline ) {
|
|
|
508
|
|
|
509 # Parse out the VEP CSQ/ANN format, which seems to differ between runs
|
|
|
510 if( $line =~ m/^##INFO=<ID=(CSQ|ANN).*Format: (\S+)">$/ ) {
|
|
|
511 # Use this as the expected column order of VEP annotation, unless we already got it from CSQ
|
|
|
512 @ann_cols_format = split( /\|/, $2 ) unless( @ann_cols_format and $1 eq "ANN" );
|
|
|
513 }
|
|
|
514 # Skip all other header lines
|
|
|
515 next if( $line =~ m/^##/ );
|
|
|
516
|
|
|
517 chomp( $line );
|
|
|
518 my ( $chrom, $pos, $var_id, $ref, $alt, $var_qual, $filter, $info_line, $format_line, @rest ) = split( "\t", $line );
|
|
|
519
|
|
|
520 # Set ID, QUAL, and FILTER to "." unless defined and non-empty
|
|
|
521 $var_id = "." unless( defined $var_id and $var_id ne "" );
|
|
|
522 $var_qual = "." unless( defined $var_qual and $var_qual ne "" );
|
|
|
523 $filter = "." unless( defined $filter and $filter ne "" );
|
|
|
524
|
|
|
525 # If FORMATted genotype fields are available, find the sample with the variant, and matched normal
|
|
|
526 if( $line =~ m/^#CHROM/ ) {
|
|
|
527 if( $format_line and scalar( @rest ) > 0 ) {
|
|
|
528 for( my $i = 0; $i <= $#rest; ++$i ) {
|
|
|
529 $vcf_tumor_idx = $i if( $rest[$i] eq $vcf_tumor_id );
|
|
|
530 $vcf_normal_idx = $i if( $rest[$i] eq $vcf_normal_id );
|
|
|
531 }
|
|
|
532 ( defined $vcf_tumor_idx ) or warn "WARNING: No genotype column for $vcf_tumor_id in VCF!\n";
|
|
|
533 ( defined $vcf_normal_idx ) or warn "WARNING: No genotype column for $vcf_normal_id in VCF!\n";
|
|
|
534 }
|
|
|
535 next;
|
|
|
536 }
|
|
|
537
|
|
|
538 # Parse out the data in the info column, and store into a hash
|
|
|
539 my %info = map {( m/=/ ? ( split( /=/, $_, 2 )) : ( $_, "1" ))} split( /\;/, $info_line );
|
|
|
540
|
|
|
541 # By default, the variant allele is the first (usually the only) allele listed under ALT. If
|
|
|
542 # there are >1 alleles in ALT, choose the first non-REF allele listed under tumor GT, that is
|
|
|
543 # also not seen under normal GT. If tumor GT is undefined or ambiguous, choose the tumor allele
|
|
|
544 # with the most supporting read depth, if available.
|
|
|
545 my @alleles = ( $ref, split( /,/, $alt ));
|
|
|
546 my $var_allele_idx = 1;
|
|
|
547
|
|
|
548 # Parse out info from the normal genotype field
|
|
|
549 my ( %nrm_info, @nrm_depths );
|
|
|
550 if( defined $vcf_normal_idx ) {
|
|
|
551 my @format_keys = split( /\:/, $format_line );
|
|
|
552 my $idx = 0;
|
|
|
553 %nrm_info = map {( $format_keys[$idx++], $_ )} split( /\:/, $rest[$vcf_normal_idx] );
|
|
|
554 }
|
|
|
555
|
|
|
556 # Parse out info from the tumor genotype field
|
|
|
557 my ( %tum_info, @tum_depths );
|
|
|
558 if( defined $vcf_tumor_idx ) {
|
|
|
559 my @format_keys = split( /\:/, $format_line );
|
|
|
560 my $idx = 0;
|
|
|
561 %tum_info = map {( $format_keys[$idx++], $_ )} split( /\:/, $rest[$vcf_tumor_idx] );
|
|
|
562
|
|
|
563 # If possible, parse the tumor genotype to identify the variant allele
|
|
|
564 if( defined $tum_info{GT} and $tum_info{GT} ne "." and $tum_info{GT} ne "./." ) {
|
|
|
565 my @tum_gt = split( /[\/|]/, $tum_info{GT} );
|
|
|
566 # Default to the first non-REF allele seen in tumor GT
|
|
|
567 ( $var_allele_idx ) = grep {$_ ne "0"} @tum_gt;
|
|
|
568 # If possible, choose the first non-REF tumor allele that is also not in normal GT
|
|
|
569 if( defined $nrm_info{GT} and $nrm_info{GT} ne "." and $nrm_info{GT} ne "./." ) {
|
|
|
570 my %nrm_gt = map {( $_, 1 )} split( /[\/|]/, $nrm_info{GT} );
|
|
|
571 ( $var_allele_idx ) = grep {$_ ne "0" and !$nrm_gt{$_}} @tum_gt;
|
|
|
572 }
|
|
|
573 # If GT was unhelpful, default to the first ALT allele and set GT to undefined
|
|
|
574 if( !defined $var_allele_idx or $var_allele_idx !~ m/^\d+$/ or $var_allele_idx >= scalar( @alleles )) {
|
|
|
575 $var_allele_idx = 1;
|
|
|
576 $tum_info{GT} = "./.";
|
|
|
577 }
|
|
|
578 }
|
|
|
579
|
|
|
580 # Standardize tumor AD and DP based on data in the genotype fields
|
|
|
581 FixAlleleDepths( \@alleles, $var_allele_idx, \%tum_info );
|
|
|
582 @tum_depths = split( ",", $tum_info{AD} );
|
|
|
583
|
|
|
584 # If genotype is undefined, use the allele depths collected to choose the major variant allele
|
|
|
585 unless( defined $tum_info{GT} and $tum_info{GT} ne '.' and $tum_info{GT} ne "./." ) {
|
|
|
586 # The first depth listed belongs to the reference allele. Of the rest, find the largest
|
|
|
587 for( my $i = 1; $i <= $#tum_depths; ++$i ) {
|
|
|
588 $var_allele_idx = $i if( $tum_depths[$i] and $tum_depths[$i] > $tum_depths[$var_allele_idx] );
|
|
|
589 }
|
|
|
590 $tum_info{GT} = "./.";
|
|
|
591 if( defined $tum_info{DP} and $tum_info{DP} ne '.' and $tum_info{DP} != 0 and defined $tum_depths[$var_allele_idx] ) {
|
|
|
592 my $vaf = $tum_depths[$var_allele_idx] / $tum_info{DP};
|
|
|
593 $tum_info{GT} = ( $vaf < $min_hom_vaf ? "0/1" : "1/1" );
|
|
|
594 }
|
|
|
595 }
|
|
|
596 }
|
|
|
597
|
|
|
598 # Set the variant allele to whatever we selected above
|
|
|
599 my $var = $alleles[$var_allele_idx];
|
|
|
600
|
|
|
601 # Standardize normal AD and DP based on data in the genotype fields
|
|
|
602 if( defined $vcf_normal_idx ) {
|
|
|
603 FixAlleleDepths( \@alleles, $var_allele_idx, \%nrm_info );
|
|
|
604 @nrm_depths = split( ",", $nrm_info{AD} );
|
|
|
605 $nrm_info{GT} = "./." unless( defined $nrm_info{GT} and $nrm_info{GT} ne '.' );
|
|
|
606 }
|
|
|
607
|
|
|
608 # Figure out the appropriate start/stop loci and variant type/allele to report in the MAF
|
|
|
609 my $start = my $stop = my $var_type = my $inframe = "";
|
|
|
610 my ( $ref_length, $var_length ) = ( length( $ref ), length( $var ));
|
|
|
611 # Backup the VCF-style position and REF/ALT alleles, so we can use it later
|
|
|
612 my ( $vcf_pos, $vcf_ref, $vcf_var ) = ( $pos, $ref, $var );
|
|
|
613 # Remove any prefixed reference bps from all alleles, using "-" for simple indels
|
|
|
614 while( $ref and $var and substr( $ref, 0, 1 ) eq substr( $var, 0, 1 ) and $ref ne $var ) {
|
|
|
615 ( $ref, $var, @alleles ) = map{$_ = substr( $_, 1 ); ( $_ ? $_ : "-" )} ( $ref, $var, @alleles );
|
|
|
616 --$ref_length; --$var_length; ++$pos;
|
|
|
617 }
|
|
|
618 # Handle SNPs, DNPs, TNPs, or anything larger (ONP)
|
|
|
619 if( $ref_length == $var_length ) {
|
|
|
620 ( $start, $stop ) = ( $pos, $pos + $var_length - 1 );
|
|
|
621 my %np_type = qw( 1 SNP 2 DNP 3 TNP );
|
|
|
622 $var_type = ( $var_length > 3 ? "ONP" : $np_type{$var_length} );
|
|
|
623 }
|
|
|
624 # Handle all indels, including those complex ones which contain substitutions
|
|
|
625 elsif( $ref_length != $var_length ) {
|
|
|
626 if( $ref_length < $var_length ) { # Handle insertions, and the special case for complex ones
|
|
|
627 ( $start, $stop ) = (( $ref eq "-" ? $pos - 1 : $pos ), ( $ref eq "-" ? $pos : $pos + $ref_length - 1 ));
|
|
|
628 $var_type = "INS";
|
|
|
629 }
|
|
|
630 else { # Handle deletions
|
|
|
631 ( $start, $stop ) = ( $pos, $pos + $ref_length - 1 );
|
|
|
632 $var_type = "DEL";
|
|
|
633 }
|
|
|
634 $inframe = ( abs( $ref_length - $var_length ) % 3 == 0 ? 1 : 0 );
|
|
|
635 }
|
|
|
636
|
|
|
637 my @all_effects; # A list of effects of this variant on all possible transcripts
|
|
|
638 my $maf_effect; # A single effect per variant to report in the standard MAF columns
|
|
|
639 my %maf_line = map{( $_, '' )} @maf_header; # Initialize MAF fields with blank strings
|
|
|
640
|
|
|
641 # VEP provides a comma-delimited list of consequences, with pipe-delim details per consequence
|
|
|
642 # It replaces ',' in details with '&'. We'll assume that all '&'s we see, were formerly commas
|
|
|
643 # "Consequence" might list multiple effects on the same transcript e.g. missense,splice_region
|
|
|
644 if( $info{CSQ} or $info{ANN} ) {
|
|
|
645
|
|
|
646 my $ann_lines = ( $info{CSQ} ? $info{CSQ} : $info{ANN} );
|
|
|
647 foreach my $ann_line ( split( /,/, $ann_lines )) {
|
|
|
648 my $idx = 0;
|
|
|
649 my %effect = map{s/\&/,/g; ( $ann_cols_format[$idx++], ( defined $_ ? $_ : '' ))} split( /\|/, $ann_line );
|
|
|
650
|
|
|
651 # Remove transcript ID from HGVS codon/protein changes, to make it easier on the eye
|
|
|
652 $effect{HGVSc} =~ s/^.*:// if( $effect{HGVSc} );
|
|
|
653 $effect{HGVSp} =~ s/^.*:// if( $effect{HGVSp} );
|
|
|
654
|
|
|
655 # Remove the prefixed HGVSc code in HGVSp, if found
|
|
|
656 $effect{HGVSp} =~ s/^.*\((p\.\S+)\)/$1/ if( $effect{HGVSp} and $effect{HGVSp} =~ m/^c\./ );
|
|
|
657
|
|
|
658 # Sort consequences by decreasing order of severity, and pick the most severe one
|
|
|
659 $effect{Consequence} = join( ",", sort { GetEffectPriority($a) <=> GetEffectPriority($b) } split( ",", $effect{Consequence} ));
|
|
|
660 ( $effect{One_Consequence} ) = split( ",", $effect{Consequence} );
|
|
|
661
|
|
|
662 # When VEP fails to provide any value in Consequence, tag it as an intergenic variant
|
|
|
663 $effect{One_Consequence} = "intergenic_variant" unless( $effect{Consequence} );
|
|
|
664
|
|
|
665 # Create a shorter HGVS protein format using 1-letter codes
|
|
|
666 if( $effect{HGVSp} ) {
|
|
|
667 my $hgvs_p_short = $effect{HGVSp};
|
|
|
668 while( $hgvs_p_short and my ( $find, $replace ) = each %aa3to1 ) {
|
|
|
669 eval "\$hgvs_p_short =~ s{$find}{$replace}g";
|
|
|
670 }
|
|
|
671 $effect{HGVSp_Short} = $hgvs_p_short;
|
|
|
672 }
|
|
|
673
|
|
|
674 # Fix HGVSp_Short, CDS_position, and Protein_position for splice acceptor/donor variants
|
|
|
675 if( $effect{One_Consequence} =~ m/^(splice_acceptor_variant|splice_donor_variant)$/ ) {
|
|
|
676 my ( $c_pos ) = $effect{HGVSc} =~ m/^c.(\d+)/;
|
|
|
677 if( defined $c_pos ) {
|
|
|
678 $c_pos = 1 if( $c_pos < 1 ); # Handle negative cDNA positions used in 5' UTRs
|
|
|
679 my $p_pos = sprintf( "%.0f", ( $c_pos + $c_pos % 3 ) / 3 );
|
|
|
680 $effect{HGVSp_Short} = "p.X" . $p_pos . "_splice";
|
|
|
681 $effect{CDS_position} =~ s/^-(\/\d+)$/$c_pos$1/;
|
|
|
682 $effect{Protein_position} =~ s/^-(\/\d+)$/$p_pos$1/;
|
|
|
683 }
|
|
|
684 }
|
|
|
685
|
|
|
686 # Fix HGVSp_Short for Silent mutations, so it mentions the amino-acid and position
|
|
|
687 if( defined $effect{HGVSp_Short} and $effect{HGVSp_Short} eq "p.=" ) {
|
|
|
688 my ( $p_pos ) = $effect{Protein_position} =~ m/^(\d+)(-\d+)?\/\d+$/;
|
|
|
689 my $aa = $effect{Amino_acids};
|
|
|
690 $effect{HGVSp_Short} = "p.$aa" . $p_pos . "=";
|
|
|
691 }
|
|
|
692
|
|
|
693 # Copy VEP data into MAF fields that don't share the same identifier
|
|
|
694 $effect{Transcript_ID} = $effect{Feature};
|
|
|
695 $effect{Exon_Number} = $effect{EXON};
|
|
|
696 $effect{Hugo_Symbol} = ( $effect{SYMBOL} ? $effect{SYMBOL} : '' );
|
|
|
697
|
|
|
698 # If AF columns from the older VEP are found, rename to the newer ones for consistency
|
|
|
699 my %af_col = qw( GMAF AF AFR_MAF AFR_AF AMR_MAF AMR_AF ASN_MAF ASN_AF EAS_MAF EAS_AF
|
|
|
700 EUR_MAF EUR_AF SAS_MAF SAS_AF AA_MAF AA_AF EA_MAF EA_AF );
|
|
|
701 map { $effect{$af_col{$_}} = $effect{$_} if( defined $effect{$_} )} keys %af_col;
|
|
|
702
|
|
|
703 # If VEP couldn't find this variant in dbSNP/COSMIC/etc., we'll say it's "novel"
|
|
|
704 if( $effect{Existing_variation} ) {
|
|
|
705 # ::NOTE:: If seen in a DB other than dbSNP, this field will remain blank
|
|
|
706 $effect{dbSNP_RS} = join( ",", grep{m/^rs\d+$/} split( /,/, $effect{Existing_variation} ));
|
|
|
707 }
|
|
|
708 else {
|
|
|
709 $effect{dbSNP_RS} = "novel";
|
|
|
710 }
|
|
|
711
|
|
|
712 # Transcript_Length isn't separately reported, but can be parsed out from cDNA_position
|
|
|
713 ( $effect{Transcript_Length} ) = $effect{cDNA_position} =~ m/\/(\d+)$/;
|
|
|
714 $effect{Transcript_Length} = 0 unless( defined $effect{Transcript_Length} );
|
|
|
715
|
|
|
716 # Skip effects on other ALT alleles. If ALLELE_NUM is undefined (e.g. for INFO:SVTYPE), don't skip any
|
|
|
717 push( @all_effects, \%effect ) unless( $effect{ALLELE_NUM} and $effect{ALLELE_NUM} != $var_allele_idx );
|
|
|
718 }
|
|
|
719
|
|
|
720 # Sort effects first by transcript biotype, then by severity, and then by longest transcript
|
|
|
721 @all_effects = sort {
|
|
|
722 GetBiotypePriority( $a->{BIOTYPE} ) <=> GetBiotypePriority( $b->{BIOTYPE} ) ||
|
|
|
723 GetEffectPriority( $a->{One_Consequence} ) <=> GetEffectPriority( $b->{One_Consequence} ) ||
|
|
|
724 $b->{Transcript_Length} <=> $a->{Transcript_Length}
|
|
|
725 } @all_effects;
|
|
|
726
|
|
|
727 # Find the highest priority effect with a gene symbol (usually the first one)
|
|
|
728 my ( $effect_with_gene_name ) = grep { $_->{SYMBOL} } @all_effects;
|
|
|
729 my $maf_gene = $effect_with_gene_name->{SYMBOL} if( $effect_with_gene_name );
|
|
|
730
|
|
|
731 # If the gene has user-defined custom isoform overrides, choose that instead
|
|
|
732 ( $maf_effect ) = grep { $_->{SYMBOL} and $_->{SYMBOL} eq $maf_gene and $_->{Transcript_ID} and $custom_enst{$_->{Transcript_ID}} } @all_effects;
|
|
|
733
|
|
|
734 # Find the effect on the canonical transcript of that highest priority gene
|
|
|
735 ( $maf_effect ) = grep { $_->{SYMBOL} and $_->{SYMBOL} eq $maf_gene and $_->{CANONICAL} and $_->{CANONICAL} eq "YES" } @all_effects unless( $maf_effect );
|
|
|
736
|
|
|
737 # If that gene has no canonical transcript tagged, choose the highest priority canonical effect on any gene
|
|
|
738 ( $maf_effect ) = grep { $_->{CANONICAL} and $_->{CANONICAL} eq "YES" } @all_effects unless( $maf_effect );
|
|
|
739
|
|
|
740 # If none of the effects are tagged as canonical, then just report the top priority effect
|
|
|
741 $maf_effect = $all_effects[0] unless( $maf_effect );
|
|
|
742 }
|
|
|
743
|
|
|
744 # Construct the MAF columns from the $maf_effect hash
|
|
|
745 %maf_line = map{( $_, ( $maf_effect->{$_} ? $maf_effect->{$_} : '' ))} @maf_header;
|
|
|
746 $maf_line{Hugo_Symbol} = $maf_effect->{Transcript_ID} unless( $maf_effect->{Hugo_Symbol} );
|
|
|
747 $maf_line{Hugo_Symbol} = 'Unknown' unless( $maf_effect->{Transcript_ID} );
|
|
|
748 $maf_line{Entrez_Gene_Id} = ( defined $entrez_id_map{$maf_effect->{Gene}} ? $entrez_id_map{$maf_effect->{Gene}} : "0" );
|
|
|
749 $maf_line{Center} = $maf_center;
|
|
|
750 $maf_line{NCBI_Build} = $ncbi_build;
|
|
|
751 $maf_line{Chromosome} = $chrom;
|
|
|
752 $maf_line{Start_Position} = $start;
|
|
|
753 $maf_line{End_Position} = $stop;
|
|
|
754 $maf_line{Strand} = '+'; # Per MAF definition, only the positive strand is an accepted value
|
|
|
755 $maf_line{STRAND_VEP} = $maf_effect->{STRAND}; # Renamed to avoid mixup with "Strand" above
|
|
|
756 $maf_line{Variant_Classification} = GetVariantClassification( $maf_effect->{One_Consequence}, $var_type, $inframe );
|
|
|
757 $maf_line{Variant_Type} = $var_type;
|
|
|
758 $maf_line{Reference_Allele} = $ref;
|
|
|
759 # ::NOTE:: If tumor genotype is unavailable, then we'll assume it's ref/var heterozygous
|
|
|
760 $maf_line{Tumor_Seq_Allele1} = $ref;
|
|
|
761 $maf_line{Tumor_Seq_Allele2} = $var;
|
|
|
762 if( defined $tum_info{GT} and $tum_info{GT} ne "." and $tum_info{GT} ne "./." ) {
|
|
|
763 # ::NOTE:: MAF only supports biallelic sites. Tumor_Seq_Allele2 must always be the $var
|
|
|
764 # picked earlier. For Tumor_Seq_Allele1, pick the first non-var allele in GT (usually $ref)
|
|
|
765 my ( $idx1, $idx2 ) = split( /[\/|]/, $tum_info{GT} );
|
|
|
766 # If GT was monoploid, then $idx2 will be undefined, and we should set it equal to $idx1
|
|
|
767 $idx2 = $idx1 unless( defined $idx2 );
|
|
|
768 $maf_line{Tumor_Seq_Allele1} = ( $alleles[$idx1] ne $var ? $alleles[$idx1] : $alleles[$idx2] );
|
|
|
769 }
|
|
|
770 # ::NOTE:: If normal genotype is unavailable, then we'll assume it's ref/ref homozygous
|
|
|
771 $maf_line{Match_Norm_Seq_Allele1} = $ref;
|
|
|
772 $maf_line{Match_Norm_Seq_Allele2} = $ref;
|
|
|
773 if( defined $nrm_info{GT} and $nrm_info{GT} ne "." and $nrm_info{GT} ne "./." ) {
|
|
|
774 # ::NOTE:: MAF only supports biallelic sites. So choose the first two alleles listed in GT
|
|
|
775 my ( $idx1, $idx2 ) = split( /[\/|]/, $nrm_info{GT} );
|
|
|
776 # If GT was monoploid, then $idx2 will be undefined, and we should set it equal to $idx1
|
|
|
777 $idx2 = $idx1 unless( defined $idx2 );
|
|
|
778 $maf_line{Match_Norm_Seq_Allele1} = $alleles[$idx1];
|
|
|
779 $maf_line{Match_Norm_Seq_Allele2} = $alleles[$idx2];
|
|
|
780 }
|
|
|
781 $maf_line{Tumor_Sample_Barcode} = $tumor_id;
|
|
|
782 $maf_line{Matched_Norm_Sample_Barcode} = $normal_id;
|
|
|
783 $maf_line{t_depth} = $tum_info{DP} if( defined $tum_info{DP} and $tum_info{DP} ne "." );
|
|
|
784 ( $maf_line{t_ref_count}, $maf_line{t_alt_count} ) = @tum_depths[0,$var_allele_idx] if( @tum_depths );
|
|
|
785 $maf_line{n_depth} = $nrm_info{DP} if( defined $nrm_info{DP} and $nrm_info{DP} ne "." );
|
|
|
786 ( $maf_line{n_ref_count}, $maf_line{n_alt_count} ) = @nrm_depths[0,$var_allele_idx] if( @nrm_depths );
|
|
|
787
|
|
|
788 # Create a semicolon delimited list summarizing the prioritized effects in @all_effects
|
|
|
789 $maf_line{all_effects} = "";
|
|
|
790 foreach my $effect ( @all_effects ) {
|
|
|
791 my $gene_name = $effect->{Hugo_Symbol};
|
|
|
792 my $effect_type = $effect->{One_Consequence};
|
|
|
793 my $protein_change = ( $effect->{HGVSp} ? $effect->{HGVSp} : '' );
|
|
|
794 my $transcript_id = ( $effect->{Transcript_ID} ? $effect->{Transcript_ID} : '' );
|
|
|
795 my $refseq_ids = ( $effect->{RefSeq} ? $effect->{RefSeq} : '' );
|
|
|
796 $maf_line{all_effects} .= "$gene_name,$effect_type,$protein_change,$transcript_id,$refseq_ids;" if( $effect_type and $transcript_id );
|
|
|
797 }
|
|
|
798
|
|
|
799 # If this variant was seen in the ExAC VCF, let's report allele counts and frequencies
|
|
|
800 # ExAC merges and pads multiallelic sites, so we need to normalize each variant, (remove common
|
|
|
801 # suffixed bps) before we compare it to our variant allele
|
|
|
802 my $locus = "$chrom:$vcf_pos";
|
|
|
803 if( defined $filter_data{$locus} ) {
|
|
|
804 my $idx = 0;
|
|
|
805 $maf_line{ExAC_FILTER} = $filter_data{$locus}{FILTER};
|
|
|
806 foreach my $f_var ( split( ",", $filter_data{$locus}{ALT} )) {
|
|
|
807 my $f_ref = $filter_data{$locus}{REF};
|
|
|
808 # De-pad suffixed bps that are identical between ref/var alleles
|
|
|
809 while( $f_ref and $f_var and substr( $f_ref, -1, 1 ) eq substr( $f_var, -1, 1 ) and $f_ref ne $f_var ) {
|
|
|
810 ( $f_ref, $f_var ) = map{substr( $_, 0, -1 )} ( $f_ref, $f_var );
|
|
|
811 }
|
|
|
812 # If this normalized variant matches our input variant, report its allele counts
|
|
|
813 # ExAC reports MNPs as separate SNPs. So we'll need to report the ACs of the first SNP
|
|
|
814 if(( $vcf_ref eq $f_ref and $vcf_var eq $f_var ) or
|
|
|
815 (( $var_type eq "DNP" or $var_type eq "TNP" or $var_type eq "ONP") and
|
|
|
816 ( $vcf_ref =~ m/^$f_ref/ and $vcf_var =~ m/^$f_var/ ))) {
|
|
|
817 my @var_acs = split( ",", $filter_data{$locus}{AC} );
|
|
|
818 my $var_ac = $var_acs[$idx];
|
|
|
819 my $pop_an = $filter_data{$locus}{AN};
|
|
|
820 $maf_line{ExAC_AF} = sprintf( "%.4g", ( $pop_an ? ( $var_ac / $pop_an ) : 0 ));
|
|
|
821 $maf_line{ExAC_AC_AN} = join( "/", $var_ac, $pop_an );
|
|
|
822 # Do the same for AC/AN in each subpopulation, and the adjusted total AC/AN (Adj)
|
|
|
823 foreach my $subpop ( qw( AFR AMR EAS FIN NFE OTH SAS Adj )) {
|
|
|
824 @var_acs = split( ",", $filter_data{$locus}{"AC_$subpop"} );
|
|
|
825 $var_ac = $var_acs[$idx];
|
|
|
826 $pop_an = $filter_data{$locus}{"AN_$subpop"};
|
|
|
827 $maf_line{"ExAC_AF_$subpop"} = sprintf( "%.4g", ( $pop_an ? ( $var_ac / $pop_an ) : 0 ));
|
|
|
828 $maf_line{"ExAC_AC_AN_$subpop"} = join( "/", $var_ac, $pop_an );
|
|
|
829 }
|
|
|
830 last;
|
|
|
831 }
|
|
|
832 ++$idx;
|
|
|
833 }
|
|
|
834 }
|
|
|
835
|
|
|
836 # Copy FILTER from input VCF, and tag calls with high allele counts in any ExAC subpopulation
|
|
|
837 my $subpop_count = 0;
|
|
|
838 # Remove existing common_variant tags from input, so it's redefined by our criteria here
|
|
|
839 $filter = join( ";", grep{ $_ ne "common_variant" } split( /,|;/, $filter ));
|
|
|
840 foreach my $subpop ( qw( AFR AMR EAS FIN NFE OTH SAS )) {
|
|
|
841 if( $maf_line{"ExAC_AC_AN_$subpop"} ) {
|
|
|
842 my ( $subpop_ac ) = split( "/", $maf_line{"ExAC_AC_AN_$subpop"} );
|
|
|
843 $subpop_count++ if( $subpop_ac > $max_filter_ac );
|
|
|
844 }
|
|
|
845 }
|
|
|
846 if( $subpop_count > 0 ) {
|
|
|
847 $filter = (( $filter eq "PASS" or $filter eq "." or !$filter ) ? "common_variant" : "$filter;common_variant" );
|
|
|
848 }
|
|
|
849 $maf_line{FILTER} = $filter;
|
|
|
850
|
|
|
851 # Also add the reference allele flanking bps that we generated earlier with samtools
|
|
|
852 my $region = "$chrom:" . ( $vcf_pos - 1 ) . "-" . ( $vcf_pos + length( $vcf_ref ));
|
|
|
853 $maf_line{flanking_bps} = $flanking_bps{$region};
|
|
|
854
|
|
|
855 # Add ID and QUAL from the input VCF into respective MAF columns
|
|
|
856 $maf_line{variant_id} = $var_id;
|
|
|
857 $maf_line{variant_qual} = $var_qual;
|
|
|
858
|
|
|
859 # If there are additional INFO data to add, then add those
|
|
|
860 foreach my $info_col ( @addl_info_cols ) {
|
|
|
861 $maf_line{$info_col} = ( defined $info{$info_col} ? $info{$info_col} : "" );
|
|
|
862 }
|
|
|
863
|
|
|
864 # If this is an SV, pair up gene names from separate lines to backfill the Fusion column later
|
|
|
865 if( $split_svs and $var=~m/^<BND|DEL|DUP|INV>$/ ) {
|
|
|
866 my $sv_key = "$var_id-$tumor_id";
|
|
|
867 if( $sv_pair{$sv_key} ) {
|
|
|
868 $sv_pair{$sv_key} = $sv_pair{$sv_key} . "-" . $maf_line{Hugo_Symbol} . " fusion";
|
|
|
869 }
|
|
|
870 else {
|
|
|
871 $sv_pair{$sv_key} = $maf_line{Hugo_Symbol};
|
|
|
872 }
|
|
|
873 }
|
|
|
874
|
|
|
875 # At this point, we've generated all we can about this variant, so write it to the MAF
|
|
|
876 $maf_fh->print( join( "\t", map{( defined $maf_line{$_} ? $maf_line{$_} : "" )} @maf_header ) . "\n" );
|
|
|
877 }
|
|
|
878 $maf_fh->close;
|
|
|
879 $annotated_vcf_fh->close;
|
|
|
880
|
|
|
881 # If the MAF lists SVs, backfill the Fusion column with gene-pair names
|
|
|
882 if( $split_svs ) {
|
|
|
883 my $output_name = substr( $output_maf, rindex( $output_maf, "/" ) + 1 );
|
|
|
884 $output_name =~ s/(\.maf)*$//;
|
|
|
885 my $tmp_output_maf = "$tmp_dir/$output_name.tmp.maf";
|
|
|
886
|
|
|
887 my $in_maf_fh = IO::File->new( $output_maf ) or die "ERROR: Couldn't open: $output_maf!\n";
|
|
|
888 my $out_maf_fh = IO::File->new( $tmp_output_maf, ">" ) or die "ERROR: Couldn't open: $tmp_output_maf!\n";
|
|
|
889 my ( $tid_idx, $fusion_idx, $var_id_idx ) = ( 0, 0, 0 );
|
|
|
890 while( my $line = $in_maf_fh->getline ) {
|
|
|
891 chomp( $line );
|
|
|
892 if( $line =~ m/^#/ ) {
|
|
|
893 $out_maf_fh->print( "$line\n" ); # Copy comments unchanged
|
|
|
894 }
|
|
|
895 elsif( $line =~ m/^Hugo_Symbol/ ) {
|
|
|
896 # Copy the header unchanged, after figuring out necessary column indexes
|
|
|
897 foreach( split( /\t/, $line )) { last if( $_ eq "Tumor_Sample_Barcode" ); ++$tid_idx; }
|
|
|
898 foreach( split( /\t/, $line )) { last if( $_ eq "Fusion" ); ++$fusion_idx; }
|
|
|
899 foreach( split( /\t/, $line )) { last if( $_ eq "variant_id" ); ++$var_id_idx; }
|
|
|
900 $out_maf_fh->print( "$line\n" ); # Copy header unchanged
|
|
|
901 }
|
|
|
902 else {
|
|
|
903 # Write the gene-pair name into the Fusion column if it was backfilled earlier
|
|
|
904 my @cols = split( /\t/, $line, -1 );
|
|
|
905 my $sv_key = $cols[$var_id_idx] . "-" . $cols[$tid_idx];
|
|
|
906 $cols[$fusion_idx] = $sv_pair{$sv_key} if( $sv_pair{$sv_key} );
|
|
|
907 $out_maf_fh->print( join( "\t", @cols ) . "\n" );
|
|
|
908 }
|
|
|
909 }
|
|
|
910 $out_maf_fh->close;
|
|
|
911 $in_maf_fh->close;
|
|
|
912
|
|
|
913 move( $tmp_output_maf, $output_maf );
|
|
|
914 }
|
|
|
915
|
|
|
916 # Converts Sequence Ontology variant types to MAF variant classifications
|
|
|
917 sub GetVariantClassification {
|
|
|
918 my ( $effect, $var_type, $inframe ) = @_;
|
|
|
919 return "Splice_Site" if( $effect =~ /^(splice_acceptor_variant|splice_donor_variant|transcript_ablation|exon_loss_variant)$/ );
|
|
|
920 return "Nonsense_Mutation" if( $effect eq 'stop_gained' );
|
|
|
921 return "Frame_Shift_Del" if(( $effect eq 'frameshift_variant' or ( $effect eq 'protein_altering_variant' and !$inframe )) and $var_type eq 'DEL' );
|
|
|
922 return "Frame_Shift_Ins" if(( $effect eq 'frameshift_variant' or ( $effect eq 'protein_altering_variant' and !$inframe )) and $var_type eq 'INS' );
|
|
|
923 return "Nonstop_Mutation" if( $effect eq 'stop_lost' );
|
|
|
924 return "Translation_Start_Site" if( $effect =~ /^(initiator_codon_variant|start_lost)$/ );
|
|
|
925 return "In_Frame_Ins" if( $effect =~ /^(inframe_insertion|disruptive_inframe_insertion)$/ or ( $effect eq 'protein_altering_variant' and $inframe and $var_type eq 'INS' ));
|
|
|
926 return "In_Frame_Del" if( $effect =~ /^(inframe_deletion|disruptive_inframe_deletion)$/ or ( $effect eq 'protein_altering_variant' and $inframe and $var_type eq 'DEL' ));
|
|
|
927 return "Missense_Mutation" if( $effect =~ /^(missense_variant|coding_sequence_variant|conservative_missense_variant|rare_amino_acid_variant)$/ );
|
|
|
928 return "Intron" if ( $effect =~ /^(transcript_amplification|intron_variant|INTRAGENIC|intragenic_variant)$/ );
|
|
|
929 return "Splice_Region" if( $effect eq 'splice_region_variant' );
|
|
|
930 return "Silent" if( $effect =~ /^(incomplete_terminal_codon_variant|synonymous_variant|stop_retained_variant|NMD_transcript_variant)$/ );
|
|
|
931 return "RNA" if( $effect =~ /^(mature_miRNA_variant|exon_variant|non_coding_exon_variant|non_coding_transcript_exon_variant|non_coding_transcript_variant|nc_transcript_variant)$/ );
|
|
|
932 return "5'UTR" if( $effect =~ /^(5_prime_UTR_variant|5_prime_UTR_premature_start_codon_gain_variant)$/ );
|
|
|
933 return "3'UTR" if( $effect eq '3_prime_UTR_variant' );
|
|
|
934 return "IGR" if( $effect =~ /^(TF_binding_site_variant|regulatory_region_variant|regulatory_region|intergenic_variant|intergenic_region)$/ );
|
|
|
935 return "5'Flank" if( $effect eq 'upstream_gene_variant' );
|
|
|
936 return "3'Flank" if ( $effect eq 'downstream_gene_variant' );
|
|
|
937
|
|
|
938 # Annotate everything else simply as a targeted region
|
|
|
939 # TFBS_ablation, TFBS_amplification,regulatory_region_ablation, regulatory_region_amplification,
|
|
|
940 # feature_elongation, feature_truncation
|
|
|
941 return "Targeted_Region";
|
|
|
942 }
|
|
|
943
|
|
|
944 # Fix the AD and DP fields, given data from a FORMATted genotype string
|
|
|
945 sub FixAlleleDepths {
|
|
|
946 my ( $alleles_ref, $var_allele_idx, $fmt_info_ref ) = @_;
|
|
|
947 my %fmt_info = %{$fmt_info_ref};
|
|
|
948 my @alleles = @{$alleles_ref};
|
|
|
949 my @depths = ();
|
|
|
950
|
|
|
951 # If AD is defined, then parse out all REF/ALT allele depths, or whatever is in it
|
|
|
952 if( defined $fmt_info{AD} and $fmt_info{AD} ne "." ) {
|
|
|
953 @depths = map{( m/^\d+$/ ? $_ : "" )}split( /,/, $fmt_info{AD} );
|
|
|
954 }
|
|
|
955
|
|
|
956 # Handle VarScan VCF lines where AD contains only 1 depth, and REF allele depth is in RD
|
|
|
957 if( scalar( @depths ) == 1 and defined $fmt_info{RD} ) {
|
|
|
958 @depths = map{""} @alleles;
|
|
|
959 $depths[0] = $fmt_info{RD};
|
|
|
960 $depths[$var_allele_idx] = $fmt_info{AD};
|
|
|
961 }
|
|
|
962 # Handle SomaticSniper VCF lines, where allele depths must be extracted from BCOUNT
|
|
|
963 elsif( !defined $fmt_info{AD} and defined $fmt_info{BCOUNT} ) {
|
|
|
964 my %b_idx = ( A=>0, C=>1, G=>2, T=>3 );
|
|
|
965 my @bcount = split( /,/, $fmt_info{BCOUNT} );
|
|
|
966 @depths = map{(( defined $b_idx{$_} and defined $bcount[$b_idx{$_}] ) ? $bcount[$b_idx{$_}] : "" )} @alleles;
|
|
|
967 }
|
|
|
968 # Handle VCF SNV lines by Strelka, where allele depths are in AU:CU:GU:TU
|
|
|
969 elsif( !defined $fmt_info{AD} and scalar( grep{defined $fmt_info{$_}} qw/AU CU GU TU/ ) == 4 ) {
|
|
|
970 # Strelka allele depths come in tiers 1,2. We'll use tier1 cuz it's stricter, and DP already is
|
|
|
971 map{( $fmt_info{$_.'U'} ) = split( ",", $fmt_info{$_.'U'} )} qw( A C G T );
|
|
|
972
|
|
|
973 # If the only ALT allele is N, then set it to the allele with the highest non-ref readcount
|
|
|
974 if( scalar( @alleles ) == 2 and $alleles[1] eq "N" ) {
|
|
|
975 my %acgt_depths = map{( defined $fmt_info{$_.'U'} ? ( $_, $fmt_info{$_.'U'} ) : ( $_, "" ))} qw( A C G T );
|
|
|
976 my @deepest = sort {$acgt_depths{$b} <=> $acgt_depths{$a}} keys %acgt_depths;
|
|
|
977 ( $alleles[1] ) = ( $deepest[0] ne $alleles[0] ? $deepest[0] : $deepest[1] );
|
|
|
978 }
|
|
|
979 @depths = map{( defined $fmt_info{$_.'U'} ? $fmt_info{$_.'U'} : "" )} @alleles;
|
|
|
980 }
|
|
|
981 # Handle VCF Indel lines by Strelka, where variant allele depth is in TIR
|
|
|
982 elsif( !defined $fmt_info{AD} and $fmt_info{TIR} ) {
|
|
|
983 # Reference allele depth is not provided by Strelka for indels, so we have to skip it
|
|
|
984 @depths = ( "", ( split /,/, $fmt_info{TIR} )[0] );
|
|
|
985 }
|
|
|
986 # Handle VCF lines by CaVEMan, where allele depths are in FAZ:FCZ:FGZ:FTZ:RAZ:RCZ:RGZ:RTZ
|
|
|
987 elsif( !defined $fmt_info{AD} and scalar( grep{defined $fmt_info{$_}} qw/FAZ FCZ FGZ FTZ RAZ RCZ RGZ RTZ/ ) == 8 ) {
|
|
|
988 # Create tags for forward+reverse strand reads, and use those to determine REF/ALT depths
|
|
|
989 map{ $fmt_info{$_} = $fmt_info{'F'.$_} + $fmt_info{'R'.$_} } qw( AZ CZ GZ TZ );
|
|
|
990 @depths = map{( defined $fmt_info{$_.'Z'} ? $fmt_info{$_.'Z'} : "" )} @alleles;
|
|
|
991 }
|
|
|
992 # Handle VCF lines from the Ion Torrent Suite where ALT depths are in AO and REF depths are in RO
|
|
|
993 elsif( !defined $fmt_info{AD} and defined $fmt_info{AO} and defined $fmt_info{RO} ) {
|
|
|
994 @depths = ( $fmt_info{RO}, map{( m/^\d+$/ ? $_ : "" )}split( /,/, $fmt_info{AO} ));
|
|
|
995 }
|
|
|
996 # Handle VCF lines from Delly where REF/ALT SV junction read counts are in RR/RV respectively
|
|
|
997 elsif( !defined $fmt_info{AD} and defined $fmt_info{RR} and defined $fmt_info{RV} ) {
|
|
|
998 # Reference allele depth and depths for any other ALT alleles must be left undefined
|
|
|
999 @depths = map{""} @alleles;
|
|
|
1000 $depths[0] = $fmt_info{RR};
|
|
|
1001 $depths[$var_allele_idx] = $fmt_info{RV};
|
|
|
1002 }
|
|
|
1003 # Handle VCF lines from cgpPindel, where ALT depth and total depth are in PP:NP:PR:NR
|
|
|
1004 elsif( !defined $fmt_info{AD} and scalar( grep{defined $fmt_info{$_}} qw/PP NP PR NR/ ) == 4 ) {
|
|
|
1005 # Reference allele depth and depths for any other ALT alleles must be left undefined
|
|
|
1006 @depths = map{""} @alleles;
|
|
|
1007 $depths[$var_allele_idx] = $fmt_info{PP} + $fmt_info{NP};
|
|
|
1008 $fmt_info{DP} = $fmt_info{PR} + $fmt_info{NR};
|
|
|
1009 }
|
|
|
1010 # Handle VCF lines with ALT allele fraction in FA, which needs to be multiplied by DP to get AD
|
|
|
1011 elsif( !defined $fmt_info{AD} and defined $fmt_info{FA} and defined $fmt_info{DP} and $fmt_info{DP} ne '.' ) {
|
|
|
1012 # Reference allele depth and depths for any other ALT alleles must be left undefined
|
|
|
1013 @depths = map{""} @alleles;
|
|
|
1014 $depths[$var_allele_idx] = sprintf( "%.0f", $fmt_info{FA} * $fmt_info{DP} );
|
|
|
1015 }
|
|
|
1016 # Handle VCF lines from mpileup/bcftools where DV contains the ALT allele depth
|
|
|
1017 elsif( !defined $fmt_info{AD} and defined $fmt_info{DV} and defined $fmt_info{DP} ) {
|
|
|
1018 # Reference allele depth and depths for any other ALT alleles must be left undefined
|
|
|
1019 @depths = map{""} @alleles;
|
|
|
1020 $depths[$var_allele_idx] = $fmt_info{DV};
|
|
|
1021 }
|
|
|
1022 # Handle VCF lines where AD contains only 1 value, that we can assume is the variant allele
|
|
|
1023 elsif( defined $fmt_info{AD} and @depths and scalar( @depths ) == 1 ) {
|
|
|
1024 # Reference allele depth and depths for any other ALT alleles must be left undefined
|
|
|
1025 @depths = map{""} @alleles;
|
|
|
1026 $depths[$var_allele_idx] = $fmt_info{AD};
|
|
|
1027 }
|
|
|
1028 # For all other lines where #depths is not equal to #alleles, blank out the depths
|
|
|
1029 elsif( @depths and scalar( @depths ) ne scalar( @alleles )) {
|
|
|
1030 @depths = map{""} @alleles;
|
|
|
1031 }
|
|
|
1032
|
|
|
1033 # Sanity check that REF/ALT allele depths are lower than the total depth
|
|
|
1034 if( defined $fmt_info{DP} and $fmt_info{DP} ne '.' and (( $depths[0] and $depths[0] > $fmt_info{DP} ) or
|
|
|
1035 ( $depths[$var_allele_idx] and $depths[$var_allele_idx] > $fmt_info{DP} ) or
|
|
|
1036 ( $depths[0] and $depths[$var_allele_idx] and $depths[0] + $depths[$var_allele_idx] > $fmt_info{DP} ))) {
|
|
|
1037 $fmt_info{DP} = 0;
|
|
|
1038 map{$fmt_info{DP} += $_ if($_ and $_ ne '.')} @depths;
|
|
|
1039 }
|
|
|
1040
|
|
|
1041 # If we have REF/ALT allele depths, but no DP, then set DP equal to the sum of all ADs
|
|
|
1042 if(( defined $depths[0] and defined $depths[$var_allele_idx] ) and ( !defined $fmt_info{DP} or $fmt_info{DP} eq '.' )) {
|
|
|
1043 $fmt_info{DP} = 0;
|
|
|
1044 map{$fmt_info{DP} += $_ if($_ and $_ ne '.')} @depths;
|
|
|
1045 }
|
|
|
1046
|
|
|
1047 # Put all our changes back into the hash/array references that were passed over
|
|
|
1048 $fmt_info{AD} = join( ",", map{( $_ ne "" ? $_ : "." )} @depths );
|
|
|
1049 %{$fmt_info_ref} = %fmt_info;
|
|
|
1050 @{$alleles_ref} = @alleles;
|
|
|
1051
|
|
|
1052 return 1;
|
|
|
1053 }
|
|
|
1054
|
|
|
1055 __DATA__
|
|
|
1056
|
|
|
1057 =head1 NAME
|
|
|
1058
|
|
|
1059 vcf2maf.pl - Convert a VCF into a MAF by mapping each variant to only one of all possible gene isoforms
|
|
|
1060
|
|
|
1061 =head1 SYNOPSIS
|
|
|
1062
|
|
|
1063 perl vcf2maf.pl --help
|
|
|
1064 perl vcf2maf.pl --input-vcf WD4086.vcf --output-maf WD4086.maf --tumor-id WD4086 --normal-id NB4086
|
|
|
1065
|
|
|
1066 =head1 OPTIONS
|
|
|
1067
|
|
|
1068 --input-vcf Path to input file in VCF format
|
|
|
1069 --output-maf Path to output MAF file
|
|
|
1070 --tmp-dir Folder to retain intermediate VCFs after runtime [Default: Folder containing input VCF]
|
|
|
1071 --tumor-id Tumor_Sample_Barcode to report in the MAF [TUMOR]
|
|
|
1072 --normal-id Matched_Norm_Sample_Barcode to report in the MAF [NORMAL]
|
|
|
1073 --vcf-tumor-id Tumor sample ID used in VCF's genotype columns [--tumor-id]
|
|
|
1074 --vcf-normal-id Matched normal ID used in VCF's genotype columns [--normal-id]
|
|
|
1075 --custom-enst List of custom ENST IDs that override canonical selection
|
|
|
1076 --vep-path Folder containing the vep script [~/vep]
|
|
|
1077 --vep-data VEP's base cache/plugin directory [~/.vep]
|
|
|
1078 --vep-forks Number of forked processes to use when running VEP [4]
|
|
|
1079 --buffer-size Number of variants VEP loads at a time; Reduce this for low memory systems [5000]
|
|
|
1080 --any-allele When reporting co-located variants, allow mismatched variant alleles too
|
|
|
1081 --ref-fasta Reference FASTA file [~/.vep/homo_sapiens/91_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz]
|
|
|
1082 --filter-vcf A VCF for FILTER tag common_variant. Set to 0 to disable [~/.vep/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz]
|
|
|
1083 --max-filter-ac Use tag common_variant if the filter-vcf reports a subpopulation AC higher than this [10]
|
|
|
1084 --species Ensembl-friendly name of species (e.g. mus_musculus for mouse) [homo_sapiens]
|
|
|
1085 --ncbi-build NCBI reference assembly of variants MAF (e.g. GRCm38 for mouse) [GRCh37]
|
|
|
1086 --cache-version Version of offline cache to use with VEP (e.g. 75, 84, 91) [Default: Installed version]
|
|
|
1087 --maf-center Variant calling center to report in MAF [.]
|
|
|
1088 --retain-info Comma-delimited names of INFO fields to retain as extra columns in MAF []
|
|
|
1089 --min-hom-vaf If GT undefined in VCF, minimum allele fraction to call a variant homozygous [0.7]
|
|
|
1090 --remap-chain Chain file to remap variants to a different assembly before running VEP
|
|
|
1091 --help Print a brief help message and quit
|
|
|
1092 --man Print the detailed manual
|
|
|
1093
|
|
|
1094 =head1 DESCRIPTION
|
|
|
1095
|
|
|
1096 To convert a VCF into a MAF, each variant must be mapped to only one of all possible gene transcripts/isoforms that it might affect. This selection of a single effect per variant, is often subjective. So this project is an attempt to make the selection criteria smarter, reproducible, and more configurable.
|
|
|
1097
|
|
|
1098 This script needs VEP, a variant annotator that maps effects of a variant on all possible genes and transcripts. For more info, see the README.
|
|
|
1099
|
|
|
1100 =head2 Relevant links:
|
|
|
1101
|
|
|
1102 Homepage: https://github.com/ckandoth/vcf2maf
|
|
|
1103 VCF format: http://samtools.github.io/hts-specs/
|
|
|
1104 MAF format: https://wiki.nci.nih.gov/x/eJaPAQ
|
|
|
1105 VEP: http://ensembl.org/info/docs/tools/vep/index.html
|
|
|
1106 VEP annotated VCF format: http://ensembl.org/info/docs/tools/vep/vep_formats.html#vcfout
|
|
|
1107
|
|
|
1108 =head1 AUTHORS
|
|
|
1109
|
|
|
1110 Cyriac Kandoth (ckandoth@gmail.com)
|
|
|
1111 Shweta Chavan (chavan.shweta@gmail.com)
|
|
|
1112
|
|
|
1113 =head1 LICENSE
|
|
|
1114
|
|
|
1115 Apache-2.0 | Apache License, Version 2.0 | https://www.apache.org/licenses/LICENSE-2.0
|
|
|
1116
|
|
|
1117 =cut
|