diff fpfilter.pl @ 0:276875076be1 draft

Uploaded
author elixir-it
date Tue, 03 Jul 2018 06:05:44 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fpfilter.pl	Tue Jul 03 06:05:44 2018 -0400
@@ -0,0 +1,596 @@
+#!/usr/bin/env perl
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use IO::File;
+use File::Temp qw( tempdir );
+use File::Spec;
+use Cwd qw( abs_path cwd);
+
+## Define filtering parameters ##
+
+my $min_read_pos = 0.10;
+my $max_read_pos = 1 - $min_read_pos;
+my $min_var_freq = 0.05;
+my $min_var_count = 4;
+
+my $min_strandedness = 0.01;
+my $max_strandedness = 1 - $min_strandedness;
+
+my $max_mm_qualsum_diff = 50;
+my $max_mapqual_diff = 30;
+my $max_readlen_diff = 25;
+my $min_var_dist_3 = 0.20;
+my $max_var_mm_qualsum = 100;
+
+
+## Parse arguments ##
+
+my $output_basename;
+
+my $vcf_file;
+my $output_file;
+my $bam_file;
+my $bam_index;
+my $sample;
+my $bam_readcount_path = 'bam-readcount';
+my $samtools_path = 'samtools';
+my $ref_fasta;
+my $help;
+
+
+my $opt_result;
+my @params = @ARGV;
+
+$opt_result = GetOptions(
+    'vcf-file=s' => \$vcf_file,
+    'bam-file=s' => \$bam_file,
+    'bam-index=s' => \$bam_index,
+    'sample=s' => \$sample,
+    'bam-readcount=s' => \$bam_readcount_path,
+    'samtools=s' => \$samtools_path,
+    'reference=s' => \$ref_fasta,
+    'output=s'   => \$output_file,
+    'min-read-pos=f' => \$min_read_pos,
+    'min-var-freq=f' => \$min_var_freq,
+    'min-var-count=f' => \$min_var_count,
+    'min-strandedness=f' => \$min_strandedness,
+    'max-mm-qualsum-diff=f' => \$max_mm_qualsum_diff,
+    'max-mapqual-diff=f' => \$max_mapqual_diff,
+    'max-readlen-diff=f' => \$max_readlen_diff,
+    'min-var-dist-3=f' => \$min_var_dist_3,
+    'max-var-mm-qualsum=f' => \$max_var_mm_qualsum,
+    'help' => \$help,
+);
+
+unless($opt_result) {
+    die help_text();
+}
+
+if($help) {
+    print STDOUT help_text();
+    exit 0;
+}
+
+unless($vcf_file) {
+    warn "You must provide a file to be filtered\n";
+    die help_text();
+}
+
+unless(-s $vcf_file) {
+    die "Can not find VCF file: $vcf_file\n";
+}
+
+unless($ref_fasta) {
+    warn "You must provide a reference fasta for generating readcounts to use for filtering\n";
+    die help_text();
+}
+
+unless(-s $ref_fasta) {
+    die "Can not find valid reference fasta: $ref_fasta\n";
+}
+
+unless($bam_file) {
+    die "You must provide a BAM file for generating readcounts\n";
+    die help_text();
+}
+
+unless(-s $bam_file) {
+    die "Can not find valid BAM file: $bam_file\n";
+}
+
+if($bam_index && !-s $bam_index) {
+    die "Can not find valid BAM index file: $bam_index\n";
+}
+
+unless($output_file) {
+    warn "You must provide an output file name: $output_file\n";
+    die help_text();
+}
+else {
+    $output_file = abs_path($output_file); #make sure we have full path as manipulating cwd below
+}
+
+unless($sample) {
+    warn "You must provide a sample name\n";
+    die help_text();
+}
+
+my %filters;
+$filters{'position'} = [sprintf("PB%0.f",$min_read_pos*100), "Average position on read less than " . $min_read_pos . " or greater than " . $max_read_pos . " fraction of the read length"];
+$filters{'strand_bias'} = [sprintf("SB%0.f",$min_strandedness*100), "Reads supporting the variant have less than " . $min_strandedness . " fraction of the reads on one strand, but reference supporting reads are not similarly biased"];
+$filters{'min_var_count'} = [ "MVC".$min_var_count, "Less than " . $min_var_count . " high quality reads support the variant"];
+$filters{'min_var_freq'} = [ sprintf("MVF%0.f",$min_var_freq*100), "Variant allele frequency is less than " . $min_var_freq];
+$filters{'mmqs_diff'} = [ sprintf("MMQSD%d",$max_mm_qualsum_diff), "Difference in average mismatch quality sum between variant and reference supporting reads is greater than " . $max_mm_qualsum_diff];
+$filters{'mq_diff'} = [ sprintf("MQD%d",$max_mapqual_diff), "Difference in average mapping quality sum between variant and reference supporting reads is greater than " . $max_mapqual_diff];
+$filters{'read_length'} = [ sprintf("RLD%d",$max_readlen_diff), "Difference in average clipped read length between variant and reference supporting reads is greater than " . $max_readlen_diff];
+$filters{'dist3'} = [ sprintf("DETP%0.f",$min_var_dist_3*100), "Average distance of the variant base to the effective 3' end is less than " . $min_var_dist_3];
+$filters{'var_mmqs'} = [ sprintf("MMQS%d",$max_var_mm_qualsum), "The average mismatch quality sum of reads supporting the variant is greater than " . $max_var_mm_qualsum] if($max_var_mm_qualsum);
+$filters{'no_var_readcount'} = [ "NRC", "Unable to grab readcounts for variant allele"];
+$filters{'incomplete_readcount'} = [ "IRC", "Unable to grab any sort of readcount for either the reference or the variant allele"];
+
+my $vcf_header;
+my @vcf_lines;
+
+my %rc_for_snp; # store info on snp positions and VCF line
+my %rc_for_indel; #store info on indel positions and VCF line
+
+my ($workdir, $working_fasta, $working_bam) = setup_workdir($ref_fasta, $bam_file, $bam_index);
+
+my $starting_dir = cwd();
+
+my $input = IO::File->new($vcf_file) or die "Unable to open input file $vcf_file: $!\n";
+$vcf_header = parse_vcf_header($input);
+add_filters_to_vcf_header($vcf_header, values %filters);
+add_process_log_to_header($vcf_header, $vcf_file, @params);
+
+my $header_line = $vcf_header->[-1];
+chomp $header_line;
+my @header_fields = split "\t", $header_line;
+
+while(my $entry = $input->getline) {
+    push @vcf_lines, $entry;
+    chomp $entry;
+
+    my %fields;
+    @fields{@header_fields} = split("\t", $entry);
+    
+    my $filter_sample = $fields{$sample};
+    unless($filter_sample) {
+        die "Unable to find field for $sample\n";
+    }
+    my @sample_fields = split /:/, $filter_sample;
+    unless(@sample_fields) {
+        die "Unable to parse field for $sample\n";
+    }
+    my $index = 0;
+    my %format_keys = map { $_ => $sample_fields[$index++] } split /:/, $fields{FORMAT};
+    #these are in order ACGT
+    my @alleles = ($fields{REF}, split /,/, $fields{ALT});
+    my %gt_alleles = map {$_ => 1} grep { $_ > 0 } split /\//, $format_keys{GT};
+    my @used_alleles;
+    for my $allele_index (keys %gt_alleles) {
+        push @used_alleles, $alleles[$allele_index];
+    }
+    my ($var) = sort @used_alleles; #follow existing convention of fp filter using alphabetical order to choose a single base on triallelic sites
+    $var = q{} unless defined $var; #in the case where there is no variant allele, set this to the empty string. Later it will be filtered as NRC or IRC
+    $var = uc($var);
+    my $ref = uc($fields{REF});
+    my $chrom = $fields{'#CHROM'};
+    my $pos = $fields{POS};
+
+    if(length($ref) > 1 || length($var) > 1) {
+        #it's an indel or mnp
+        if(length($ref) == length($var)) {
+            die "MNPs unsupported\n";
+        }
+        elsif(length($ref) > length($var)) {
+            #it's a deletion
+            $pos += 1;
+            ($ref, $var) = ($var, $ref);
+            $ref = substr($var, 1, 1);
+            $var = "-" . substr($var, 1);
+        }
+        else {
+            #it's an insertion
+            substr($var, 0, 1, "+");
+        }
+        $rc_for_indel{$chrom}{$pos}{$ref}{$var} = \$vcf_lines[-1];
+    }
+    else {
+        #it's a SNP
+        $rc_for_snp{$chrom}{$pos}{$ref}{$var} = \$vcf_lines[-1];
+    }
+}
+
+if(%rc_for_snp) {
+    filter_sites_in_hash(\%rc_for_snp, $bam_readcount_path, $working_bam, $working_fasta, $workdir);
+}
+else {
+    print STDERR "No SNP sites identified\n";
+}
+
+if(%rc_for_indel) {
+    filter_sites_in_hash(\%rc_for_indel, $bam_readcount_path, $working_bam, $working_fasta, $workdir, '-i');
+}
+else {
+    print STDERR "No Indel sites identified\n";
+}
+
+## Open the output files ##
+my $filtered_vcf = IO::File->new("$output_file","w") or die "Can't open output file $output_file: $!\n";
+print $filtered_vcf @$vcf_header;
+print $filtered_vcf @vcf_lines;
+
+chdir $starting_dir or die "Unable to go back to starting dir\n";
+exit(0);
+
+################################################################################
+
+=head3	read_counts_by_allele
+
+    Retrieve relevant read counts for a certain allele 
+
+
+=cut
+
+sub read_counts_by_allele {
+    (my $line, my $allele) = @_;
+
+    my @lineContents = split(/\t/, $line);
+    my $numContents = @lineContents;
+
+    for(my $colCounter = 5; $colCounter < $numContents; $colCounter++) {
+        my $this_allele = $lineContents[$colCounter];
+        my @alleleContents = split(/\:/, $this_allele);
+        if($alleleContents[0] eq $allele) {
+            my $numAlleleContents = @alleleContents;
+
+            return("") if($numAlleleContents < 8);
+
+            my $return_string = "";
+            my $return_sum = 0;
+            for(my $printCounter = 1; $printCounter < $numAlleleContents; $printCounter++) {
+                $return_sum += $alleleContents[$printCounter];
+                $return_string .= "\t" if($return_string);
+                $return_string .= $alleleContents[$printCounter];
+            }
+
+            return($return_string);
+
+        }
+    }
+
+    return("");
+}
+
+
+sub help_text {
+    return <<HELP;
+fpfilter - Filtering for Illumina Sequencing
+
+SYNOPSIS
+fpfilter [options] [file ...]
+
+OPTIONS
+--vcf-file              the input VCF file. Must have a GT field.
+--bam-file              the BAM file of the sample you are filtering on. Typically the tumor.
+--sample                the sample name of the sample you want to filter on in the VCF file.
+--reference             a fasta containing the reference sequence the BAM file was aligned to.
+--output                the filename of the output VCF file
+--min-read-pos          minimum average relative distance from start/end of read 
+--min-var-freq          minimum variant allele frequency
+--min-var-count         minimum number of variant-supporting reads
+--min-strandedness      minimum representation of variant allele on each strand
+--max-mm-qualsum-diff   maximum difference of mismatch quality sum between variant and reference reads (paralog filter)
+--max_var_mm_qualsum    maximum mismatch quality sum of reference-supporting reads
+--max-mapqual-diff      maximum difference of mapping quality between variant and reference reads
+--max-readlen-diff      maximum difference of average supporting read length between variant and reference reads (paralog filter)
+--min-var-dist-3        minimum average distance to effective 3prime end of read (real end or Q2) for variant-supporting reads
+--help                  this message
+
+DESCRIPTION
+This program will filter a VCF with a variety of filters as detailed in the VarScan2 paper (http://www.ncbi.nlm.nih.gov/pubmed/22300766). It requires the bam-readcount utility (https://github.com/genome/bam-readcount). 
+
+This filter was calibrated on 100bp PE Illumina reads. It is likely to be overly stringent for longer reads and may be less effective on shorter reads.
+
+AUTHORS
+Dan Koboldt     Original code
+Dave Larson     Modifications for VCF and exportation.
+
+HELP
+}
+
+### methods copied from elsewhere begin here...
+sub parse_vcf_header {
+    my $input_fh = shift;
+
+    my @header;
+    my $header_end = 0;
+    while (!$header_end) {
+        my $line = $input_fh->getline;
+        if ($line =~ m/^##/) {
+            push @header, $line;
+        } elsif ($line =~ m/^#/) {
+            push @header, $line;
+            $header_end = 1;
+        } else {
+            die "Missed the final header line with the sample list? Last line examined: $line Header so far: " . join("\n", @header) . "\n";
+        }
+    }
+    return \@header;
+}
+
+sub generate_region_list {
+    my ($hash, $region_fh) = @_; #input_fh should be a filehandle to the VCF
+    print STDERR "Printing variants to temporary region_list file...\n";
+    for my $chr (keys %$hash) {
+        for my $pos (sort { $a <=> $b } keys %{$hash->{$chr}}) {
+            print $region_fh "$chr\t$pos\t$pos\n";
+        }
+    }
+}
+
+sub _simplify_indel_allele {
+    my ($ref, $var) = @_;
+    #these could be padded e.g. G, GT for a T insertion or GCC G for a 2bp deletion
+    #they could also be complex e.g. GCCCGT, GCGT for a 2bp deletion
+    #they could also represent an insertion deletion event e.g. GCCCGT GCGGGGT; these cannot be represented in genome bed. Throw an error or warn.
+    #
+    #I think the algorithm should be trim end (no updating of coords required)
+    #trim beginning and return number of bases trimmed
+
+    my @ref_array = map { uc } split //, $ref;
+    my @var_array = map { uc } split //, $var;
+
+    while(@ref_array and @var_array and $ref_array[-1] eq $var_array[-1]) {
+        pop @ref_array;
+        pop @var_array;
+    }
+
+    my $right_shift = 0;
+    while(@ref_array and @var_array and $ref_array[0] eq $var_array[0]) {
+        shift @ref_array;
+        shift @var_array;
+        $right_shift++;
+    }
+
+    return (join("",@ref_array), join("",@var_array), $right_shift);
+}
+
+sub filter_site {
+    my ($ref_result, $var_result) = @_;
+    #this will return a list of filter names
+    my @filter_names;
+    if($ref_result && $var_result) {
+        ## Parse out the bam-readcounts details for each allele. The fields should be: ##
+        #num_reads : avg_mapqual : avg_basequal : avg_semq : reads_plus : reads_minus : avg_clip_read_pos : avg_mmqs : reads_q2 : avg_dist_to_q2 : avgRLclipped : avg_eff_3'_dist
+        my ($ref_count, $ref_map_qual, $ref_base_qual, $ref_semq, $ref_plus, $ref_minus, $ref_pos, $ref_subs, $ref_mmqs, $ref_q2_reads, $ref_q2_dist, $ref_avg_rl, $ref_dist_3) = split(/\t/, $ref_result);
+        my ($var_count, $var_map_qual, $var_base_qual, $var_semq, $var_plus, $var_minus, $var_pos, $var_subs, $var_mmqs, $var_q2_reads, $var_q2_dist, $var_avg_rl, $var_dist_3) = split(/\t/, $var_result);
+
+        my $ref_strandedness = my $var_strandedness = 0.50;
+        $ref_dist_3 = 0.5 if(!$ref_dist_3);
+
+        ## Use conservative defaults if we can't get mismatch quality sums ##
+        $ref_mmqs = 50 if(!$ref_mmqs);
+        $var_mmqs = 0 if(!$var_mmqs);
+        my $mismatch_qualsum_diff = $var_mmqs - $ref_mmqs;
+
+        ## Determine map qual diff ##
+
+        my $mapqual_diff = $ref_map_qual - $var_map_qual;
+
+
+        ## Determine difference in average supporting read length ##
+
+        my $readlen_diff = $ref_avg_rl - $var_avg_rl;
+
+
+        ## Determine ref strandedness ##
+
+        if(($ref_plus + $ref_minus) > 0) {
+            $ref_strandedness = $ref_plus / ($ref_plus + $ref_minus);
+            $ref_strandedness = sprintf("%.2f", $ref_strandedness);
+        }
+
+        ## Determine var strandedness ##
+
+        if(($var_plus + $var_minus) > 0) {
+            $var_strandedness = $var_plus / ($var_plus + $var_minus);
+            $var_strandedness = sprintf("%.2f", $var_strandedness);
+        }
+
+        if($var_count && ($var_plus + $var_minus)) {
+            ## We must obtain variant read counts to proceed ##
+
+            my $var_freq = $var_count / ($ref_count + $var_count);
+
+            ## FAILURE 1: READ POSITION ##
+            if(($var_pos < $min_read_pos) || ($var_pos > $max_read_pos)) {
+                #$stats{'num_fail_pos'}++;
+                push @filter_names, $filters{'position'}->[0];
+            }
+
+            ## FAILURE 2: Variant is strand-specific but reference is NOT strand-specific ##
+            if(($var_strandedness < $min_strandedness || $var_strandedness > $max_strandedness) && ($ref_strandedness >= $min_strandedness && $ref_strandedness <= $max_strandedness)) {
+                #$stats{'num_fail_strand'}++;
+                push @filter_names, $filters{'strand_bias'}->[0];
+            }
+
+            ## FAILURE : Variant allele count does not meet minimum ##
+            if($var_count < $min_var_count) {
+                #$stats{'num_fail_varcount'}++;
+                push @filter_names, $filters{'min_var_count'}->[0];
+            }
+
+            ## FAILURE : Variant allele frequency does not meet minimum ##
+            if($var_freq < $min_var_freq) {
+                #$stats{'num_fail_varfreq'}++;
+                push @filter_names, $filters{'min_var_freq'}->[0];
+            }
+
+            ## FAILURE 3: Paralog filter for sites where variant allele mismatch-quality-sum is significantly higher than reference allele mmqs
+            if($mismatch_qualsum_diff> $max_mm_qualsum_diff) {
+                #$stats{'num_fail_mmqs'}++;
+                push @filter_names, $filters{'mmqs_diff'}->[0];
+            }
+
+            ## FAILURE 4: Mapping quality difference exceeds allowable maximum ##
+            if($mapqual_diff > $max_mapqual_diff) {
+                #$stats{'num_fail_mapqual'}++;
+                push @filter_names, $filters{'mq_diff'}->[0];
+            }
+
+            ## FAILURE 5: Read length difference exceeds allowable maximum ##
+            if($readlen_diff > $max_readlen_diff) {
+                #$stats{'num_fail_readlen'}++;
+                push @filter_names, $filters{'read_length'}->[0];
+            }
+
+            ## FAILURE 5: Read length difference exceeds allowable maximum ##
+            if($var_dist_3 < $min_var_dist_3) {
+                #$stats{'num_fail_dist3'}++;
+                push @filter_names, $filters{'dist3'}->[0];
+            }
+
+            if($max_var_mm_qualsum && $var_mmqs > $max_var_mm_qualsum) {
+                #$stats{'num_fail_var_mmqs'}++;
+                push @filter_names, $filters{'var_mmqs'}->[0];
+            }
+
+            ## SUCCESS: Pass Filter ##
+            if(@filter_names == 0) {
+                #$stats{'num_pass_filter'}++;
+                ## Print output, and append strandedness information ##
+                @filter_names = ('PASS');
+            }
+
+        }
+        else {
+            push @filter_names, $filters{'no_var_readcount'}->[0];
+        }
+    }
+    else {
+        #$stats{'num_no_readcounts'}++;
+        #print $fail_fh "$line\tno_readcounts\n";
+        push @filter_names, $filters{'incomplete_readcount'}->[0];
+    }
+    return @filter_names;
+}
+
+sub add_filters_to_vcf_header {
+    my ($parsed_header, @filter_refs) = @_;
+    my $column_header = pop @$parsed_header;
+    for my $filter_ref (@filter_refs) {
+        my ($filter_name, $filter_description) = @$filter_ref;
+        my $filter_line = qq{##FILTER=<ID=$filter_name,Description="$filter_description">\n};
+        push @$parsed_header, $filter_line;
+    }
+    push @$parsed_header, $column_header;
+}
+
+sub add_process_log_to_header {
+    my ($parsed_header, $input, @params) = @_;
+    my $column_header = pop @$parsed_header;
+    my $param_string = join(" ", @params);
+    push @$parsed_header, qq{##vcfProcessLog=<InputVCF=<$input>, InputVCFSource=<fpfilter>, InputVCFVer=<6.0>, InputVCFParam=<"$param_string"> InputVCFgeneAnno=<.>>\n}, $column_header;
+}
+
+sub filter_sites_in_hash {
+    my ($hash, $bam_readcount_path, $bam_file, $ref_fasta, $working_dir, $optional_param) = @_;
+    #done parsing vcf
+    $optional_param ||= '';
+    my $list_name = File::Spec->catfile($working_dir, "regions.txt");
+    my $list_fh = IO::File->new($list_name,"w") or die "Unable to open file for coordinates\n";
+    generate_region_list($hash, $list_fh);
+    $list_fh->close();
+
+## run bam-readcount
+    my $bam_readcount_cmd = "$bam_readcount_path -f $ref_fasta -l $list_name -w 0 -b 20 $optional_param $bam_file|";
+    my $rc_results = IO::File->new($bam_readcount_cmd) or die "Unable to open pipe to bam-readcount cmd: $bam_readcount_cmd\n";
+    while(my $rc_line = $rc_results->getline) {
+        chomp $rc_line;
+        my ($chrom, $position) = split(/\t/, $rc_line);
+        if($hash->{$chrom}{$position}) {
+            for my $ref (keys %{$hash->{$chrom}{$position}}) {
+                for my $var (keys %{$hash->{$chrom}{$position}{$ref}}) {
+                    my $ref_result = read_counts_by_allele($rc_line, $ref);
+                    my $var_result = read_counts_by_allele($rc_line, $var);
+                    my @filters = filter_site($ref_result, $var_result);
+
+                    my $vcf_line_ref = $hash->{$chrom}{$position}{$ref}{$var};
+                    my @fields = split "\t", $$vcf_line_ref;
+                    if($fields[6] eq '.' || $fields[6] eq 'PASS') {
+                        $fields[6] = join(";", @filters);
+                    }
+                    else {
+                        $fields[6] = join(";", $fields[6], @filters) if($filters[0] ne 'PASS');
+                    }
+                    $$vcf_line_ref = join("\t", @fields);
+                }
+            }
+        }
+        else {
+            die "Unknown site for rc\n";
+        }
+    }
+    unless($rc_results->close) {
+        die "Error running bam-readcount\n";
+    }
+}
+
+sub setup_workdir {
+    my ($reference, $bam_file, $bam_index) = @_;
+    $reference = abs_path($reference);
+    $bam_file = abs_path($bam_file);
+    $bam_index = abs_path($bam_index) if $bam_index;
+
+    my $dir = File::Temp->newdir('fpfilterXXXXX', TMPDIR => 1, CLEANUP => 1, DIR => './') or 
+        die "Unable to create working directory\n";
+
+    #symlink in necessary files to run
+    my $working_reference =  File::Spec->catfile($dir, "reference.fa");
+    symlink $reference, $working_reference;
+
+    my $fa_index = $reference . ".fai";
+    unless(-e $fa_index) {
+        index_fasta($working_reference);
+    }
+    else {
+        symlink $fa_index, File::Spec->catfile($dir, "reference.fa.fai");
+    }
+
+    my $working_bam = File::Spec->catfile($dir, "tumor.bam");
+    my $working_bam_index = File::Spec->catfile($dir, "tumor.bam.bai");
+    symlink $bam_file, $working_bam;
+    if($bam_index) {
+        symlink $bam_index, $working_bam_index;
+    }
+    elsif(-e $bam_file . ".bai") {
+        symlink $bam_file . ".bai", $working_bam_index;
+    }
+    else {
+        index_bam($working_bam);
+    }
+    return ($dir, $working_reference, $working_bam);
+}
+
+sub index_fasta {
+    my ($fasta) = @_;
+
+    print STDERR "Indexing fasta...\n";
+    my @args = ($samtools_path, "faidx", $fasta);
+    system(@args) == 0
+        or die "Unable to index $fasta: $?\n";
+}
+
+sub index_bam {
+    my ($bam) = @_;
+
+    print STDERR "Indexing BAM...\n";
+    my @args = ($samtools_path, "index", $bam);
+    system(@args) == 0
+        or die "Unable to index $bam: $?\n";
+}