fpfilter: fpfilter.pl comparison

comparison fpfilter.pl @ 0:276875076be1 draft

Uploaded

author	elixir-it
date	Tue, 03 Jul 2018 06:05:44 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:276875076be1
+#!/usr/bin/env perl
+use warnings;
+use strict;
+use Getopt::Long;
+use IO::File;
+use File::Temp qw( tempdir );
+use File::Spec;
+use Cwd qw( abs_path cwd);
+## Define filtering parameters ##
+my $min_read_pos = 0.10;
+my $max_read_pos = 1 - $min_read_pos;
+my $min_var_freq = 0.05;
+my $min_var_count = 4;
+my $min_strandedness = 0.01;
+my $max_strandedness = 1 - $min_strandedness;
+my $max_mm_qualsum_diff = 50;
+my $max_mapqual_diff = 30;
+my $max_readlen_diff = 25;
+my $min_var_dist_3 = 0.20;
+my $max_var_mm_qualsum = 100;
+## Parse arguments ##
+my $output_basename;
+my $vcf_file;
+my $output_file;
+my $bam_file;
+my $bam_index;
+my $sample;
+my $bam_readcount_path = 'bam-readcount';
+my $samtools_path = 'samtools';
+my $ref_fasta;
+my $help;
+my $opt_result;
+my @params = @ARGV;
+$opt_result = GetOptions(
+'vcf-file=s' => \$vcf_file,
+'bam-file=s' => \$bam_file,
+'bam-index=s' => \$bam_index,
+'sample=s' => \$sample,
+'bam-readcount=s' => \$bam_readcount_path,
+'samtools=s' => \$samtools_path,
+'reference=s' => \$ref_fasta,
+'output=s'   => \$output_file,
+'min-read-pos=f' => \$min_read_pos,
+'min-var-freq=f' => \$min_var_freq,
+'min-var-count=f' => \$min_var_count,
+'min-strandedness=f' => \$min_strandedness,
+'max-mm-qualsum-diff=f' => \$max_mm_qualsum_diff,
+'max-mapqual-diff=f' => \$max_mapqual_diff,
+'max-readlen-diff=f' => \$max_readlen_diff,
+'min-var-dist-3=f' => \$min_var_dist_3,
+'max-var-mm-qualsum=f' => \$max_var_mm_qualsum,
+'help' => \$help,
+);
+unless($opt_result) {
+die help_text();
+}
+if($help) {
+print STDOUT help_text();
+exit 0;
+}
+unless($vcf_file) {
+warn "You must provide a file to be filtered\n";
+die help_text();
+}
+unless(-s $vcf_file) {
+die "Can not find VCF file: $vcf_file\n";
+}
+unless($ref_fasta) {
+warn "You must provide a reference fasta for generating readcounts to use for filtering\n";
+die help_text();
+}
+unless(-s $ref_fasta) {
+die "Can not find valid reference fasta: $ref_fasta\n";
+}
+unless($bam_file) {
+die "You must provide a BAM file for generating readcounts\n";
+die help_text();
+}
+unless(-s $bam_file) {
+die "Can not find valid BAM file: $bam_file\n";
+}
+if($bam_index && !-s $bam_index) {
+die "Can not find valid BAM index file: $bam_index\n";
+}
+unless($output_file) {
+warn "You must provide an output file name: $output_file\n";
+die help_text();
+}
+else {
+$output_file = abs_path($output_file); #make sure we have full path as manipulating cwd below
+}
+unless($sample) {
+warn "You must provide a sample name\n";
+die help_text();
+}
+my %filters;
+$filters{'position'} = [sprintf("PB%0.f",$min_read_pos*100), "Average position on read less than " . $min_read_pos . " or greater than " . $max_read_pos . " fraction of the read length"];
+$filters{'strand_bias'} = [sprintf("SB%0.f",$min_strandedness*100), "Reads supporting the variant have less than " . $min_strandedness . " fraction of the reads on one strand, but reference supporting reads are not similarly biased"];
+$filters{'min_var_count'} = [ "MVC".$min_var_count, "Less than " . $min_var_count . " high quality reads support the variant"];
+$filters{'min_var_freq'} = [ sprintf("MVF%0.f",$min_var_freq*100), "Variant allele frequency is less than " . $min_var_freq];
+$filters{'mmqs_diff'} = [ sprintf("MMQSD%d",$max_mm_qualsum_diff), "Difference in average mismatch quality sum between variant and reference supporting reads is greater than " . $max_mm_qualsum_diff];
+$filters{'mq_diff'} = [ sprintf("MQD%d",$max_mapqual_diff), "Difference in average mapping quality sum between variant and reference supporting reads is greater than " . $max_mapqual_diff];
+$filters{'read_length'} = [ sprintf("RLD%d",$max_readlen_diff), "Difference in average clipped read length between variant and reference supporting reads is greater than " . $max_readlen_diff];
+$filters{'dist3'} = [ sprintf("DETP%0.f",$min_var_dist_3*100), "Average distance of the variant base to the effective 3' end is less than " . $min_var_dist_3];
+$filters{'var_mmqs'} = [ sprintf("MMQS%d",$max_var_mm_qualsum), "The average mismatch quality sum of reads supporting the variant is greater than " . $max_var_mm_qualsum] if($max_var_mm_qualsum);
+$filters{'no_var_readcount'} = [ "NRC", "Unable to grab readcounts for variant allele"];
+$filters{'incomplete_readcount'} = [ "IRC", "Unable to grab any sort of readcount for either the reference or the variant allele"];
+my $vcf_header;
+my @vcf_lines;
+my %rc_for_snp; # store info on snp positions and VCF line
+my %rc_for_indel; #store info on indel positions and VCF line
+my ($workdir, $working_fasta, $working_bam) = setup_workdir($ref_fasta, $bam_file, $bam_index);
+my $starting_dir = cwd();
+my $input = IO::File->new($vcf_file) or die "Unable to open input file $vcf_file: $!\n";
+$vcf_header = parse_vcf_header($input);
+add_filters_to_vcf_header($vcf_header, values %filters);
+add_process_log_to_header($vcf_header, $vcf_file, @params);
+my $header_line = $vcf_header->[-1];
+chomp $header_line;
+my @header_fields = split "\t", $header_line;
+while(my $entry = $input->getline) {
+push @vcf_lines, $entry;
+chomp $entry;
+my %fields;
+@fields{@header_fields} = split("\t", $entry);
+my $filter_sample = $fields{$sample};
+unless($filter_sample) {
+die "Unable to find field for $sample\n";
+}
+my @sample_fields = split /:/, $filter_sample;
+unless(@sample_fields) {
+die "Unable to parse field for $sample\n";
+}
+my $index = 0;
+my %format_keys = map { $_ => $sample_fields[$index++] } split /:/, $fields{FORMAT};
+#these are in order ACGT
+my @alleles = ($fields{REF}, split /,/, $fields{ALT});
+my %gt_alleles = map {$_ => 1} grep { $_ > 0 } split /\//, $format_keys{GT};
+my @used_alleles;
+for my $allele_index (keys %gt_alleles) {
+push @used_alleles, $alleles[$allele_index];
+}
+my ($var) = sort @used_alleles; #follow existing convention of fp filter using alphabetical order to choose a single base on triallelic sites
+$var = q{} unless defined $var; #in the case where there is no variant allele, set this to the empty string. Later it will be filtered as NRC or IRC
+$var = uc($var);
+my $ref = uc($fields{REF});
+my $chrom = $fields{'#CHROM'};
+my $pos = $fields{POS};
+if(length($ref) > 1 || length($var) > 1) {
+#it's an indel or mnp
+if(length($ref) == length($var)) {
+die "MNPs unsupported\n";
+}
+elsif(length($ref) > length($var)) {
+#it's a deletion
+$pos += 1;
+($ref, $var) = ($var, $ref);
+$ref = substr($var, 1, 1);
+$var = "-" . substr($var, 1);
+}
+else {
+#it's an insertion
+substr($var, 0, 1, "+");
+}
+$rc_for_indel{$chrom}{$pos}{$ref}{$var} = \$vcf_lines[-1];
+}
+else {
+#it's a SNP
+$rc_for_snp{$chrom}{$pos}{$ref}{$var} = \$vcf_lines[-1];
+}
+}
+if(%rc_for_snp) {
+filter_sites_in_hash(\%rc_for_snp, $bam_readcount_path, $working_bam, $working_fasta, $workdir);
+}
+else {
+print STDERR "No SNP sites identified\n";
+}
+if(%rc_for_indel) {
+filter_sites_in_hash(\%rc_for_indel, $bam_readcount_path, $working_bam, $working_fasta, $workdir, '-i');
+}
+else {
+print STDERR "No Indel sites identified\n";
+}
+## Open the output files ##
+my $filtered_vcf = IO::File->new("$output_file","w") or die "Can't open output file $output_file: $!\n";
+print $filtered_vcf @$vcf_header;
+print $filtered_vcf @vcf_lines;
+chdir $starting_dir or die "Unable to go back to starting dir\n";
+exit(0);
+################################################################################
+=head3	read_counts_by_allele
+Retrieve relevant read counts for a certain allele
+=cut
+sub read_counts_by_allele {
+(my $line, my $allele) = @_;
+my @lineContents = split(/\t/, $line);
+my $numContents = @lineContents;
+for(my $colCounter = 5; $colCounter < $numContents; $colCounter++) {
+my $this_allele = $lineContents[$colCounter];
+my @alleleContents = split(/\:/, $this_allele);
+if($alleleContents[0] eq $allele) {
+my $numAlleleContents = @alleleContents;
+return("") if($numAlleleContents < 8);
+my $return_string = "";
+my $return_sum = 0;
+for(my $printCounter = 1; $printCounter < $numAlleleContents; $printCounter++) {
+$return_sum += $alleleContents[$printCounter];
+$return_string .= "\t" if($return_string);
+$return_string .= $alleleContents[$printCounter];
+}
+return($return_string);
+}
+}
+return("");
+}
+sub help_text {
+return <<HELP;
+fpfilter - Filtering for Illumina Sequencing
+SYNOPSIS
+fpfilter [options] [file ...]
+OPTIONS
+--vcf-file              the input VCF file. Must have a GT field.
+--bam-file              the BAM file of the sample you are filtering on. Typically the tumor.
+--sample                the sample name of the sample you want to filter on in the VCF file.
+--reference             a fasta containing the reference sequence the BAM file was aligned to.
+--output                the filename of the output VCF file
+--min-read-pos          minimum average relative distance from start/end of read
+--min-var-freq          minimum variant allele frequency
+--min-var-count         minimum number of variant-supporting reads
+--min-strandedness      minimum representation of variant allele on each strand
+--max-mm-qualsum-diff   maximum difference of mismatch quality sum between variant and reference reads (paralog filter)
+--max_var_mm_qualsum    maximum mismatch quality sum of reference-supporting reads
+--max-mapqual-diff      maximum difference of mapping quality between variant and reference reads
+--max-readlen-diff      maximum difference of average supporting read length between variant and reference reads (paralog filter)
+--min-var-dist-3        minimum average distance to effective 3prime end of read (real end or Q2) for variant-supporting reads
+--help                  this message
+DESCRIPTION
+This program will filter a VCF with a variety of filters as detailed in the VarScan2 paper (http://www.ncbi.nlm.nih.gov/pubmed/22300766). It requires the bam-readcount utility (https://github.com/genome/bam-readcount).
+This filter was calibrated on 100bp PE Illumina reads. It is likely to be overly stringent for longer reads and may be less effective on shorter reads.
+AUTHORS
+Dan Koboldt     Original code
+Dave Larson     Modifications for VCF and exportation.
+HELP
+}
+### methods copied from elsewhere begin here...
+sub parse_vcf_header {
+my $input_fh = shift;
+my @header;
+my $header_end = 0;
+while (!$header_end) {
+my $line = $input_fh->getline;
+if ($line =~ m/^##/) {
+push @header, $line;
+} elsif ($line =~ m/^#/) {
+push @header, $line;
+$header_end = 1;
+} else {
+die "Missed the final header line with the sample list? Last line examined: $line Header so far: " . join("\n", @header) . "\n";
+}
+}
+return \@header;
+}
+sub generate_region_list {
+my ($hash, $region_fh) = @_; #input_fh should be a filehandle to the VCF
+print STDERR "Printing variants to temporary region_list file...\n";
+for my $chr (keys %$hash) {
+for my $pos (sort { $a <=> $b } keys %{$hash->{$chr}}) {
+print $region_fh "$chr\t$pos\t$pos\n";
+}
+}
+}
+sub _simplify_indel_allele {
+my ($ref, $var) = @_;
+#these could be padded e.g. G, GT for a T insertion or GCC G for a 2bp deletion
+#they could also be complex e.g. GCCCGT, GCGT for a 2bp deletion
+#they could also represent an insertion deletion event e.g. GCCCGT GCGGGGT; these cannot be represented in genome bed. Throw an error or warn.
+#
+#I think the algorithm should be trim end (no updating of coords required)
+#trim beginning and return number of bases trimmed
+my @ref_array = map { uc } split //, $ref;
+my @var_array = map { uc } split //, $var;
+while(@ref_array and @var_array and $ref_array[-1] eq $var_array[-1]) {
+pop @ref_array;
+pop @var_array;
+}
+my $right_shift = 0;
+while(@ref_array and @var_array and $ref_array[0] eq $var_array[0]) {
+shift @ref_array;
+shift @var_array;
+$right_shift++;
+}
+return (join("",@ref_array), join("",@var_array), $right_shift);
+}
+sub filter_site {
+my ($ref_result, $var_result) = @_;
+#this will return a list of filter names
+my @filter_names;
+if($ref_result && $var_result) {
+## Parse out the bam-readcounts details for each allele. The fields should be: ##
+#num_reads : avg_mapqual : avg_basequal : avg_semq : reads_plus : reads_minus : avg_clip_read_pos : avg_mmqs : reads_q2 : avg_dist_to_q2 : avgRLclipped : avg_eff_3'_dist
+my ($ref_count, $ref_map_qual, $ref_base_qual, $ref_semq, $ref_plus, $ref_minus, $ref_pos, $ref_subs, $ref_mmqs, $ref_q2_reads, $ref_q2_dist, $ref_avg_rl, $ref_dist_3) = split(/\t/, $ref_result);
+my ($var_count, $var_map_qual, $var_base_qual, $var_semq, $var_plus, $var_minus, $var_pos, $var_subs, $var_mmqs, $var_q2_reads, $var_q2_dist, $var_avg_rl, $var_dist_3) = split(/\t/, $var_result);
+my $ref_strandedness = my $var_strandedness = 0.50;
+$ref_dist_3 = 0.5 if(!$ref_dist_3);
+## Use conservative defaults if we can't get mismatch quality sums ##
+$ref_mmqs = 50 if(!$ref_mmqs);
+$var_mmqs = 0 if(!$var_mmqs);
+my $mismatch_qualsum_diff = $var_mmqs - $ref_mmqs;
+## Determine map qual diff ##
+my $mapqual_diff = $ref_map_qual - $var_map_qual;
+## Determine difference in average supporting read length ##
+my $readlen_diff = $ref_avg_rl - $var_avg_rl;
+## Determine ref strandedness ##
+if(($ref_plus + $ref_minus) > 0) {
+$ref_strandedness = $ref_plus / ($ref_plus + $ref_minus);
+$ref_strandedness = sprintf("%.2f", $ref_strandedness);
+}
+## Determine var strandedness ##
+if(($var_plus + $var_minus) > 0) {
+$var_strandedness = $var_plus / ($var_plus + $var_minus);
+$var_strandedness = sprintf("%.2f", $var_strandedness);
+}
+if($var_count && ($var_plus + $var_minus)) {
+## We must obtain variant read counts to proceed ##
+my $var_freq = $var_count / ($ref_count + $var_count);
+## FAILURE 1: READ POSITION ##
+if(($var_pos < $min_read_pos) || ($var_pos > $max_read_pos)) {
+#$stats{'num_fail_pos'}++;
+push @filter_names, $filters{'position'}->[0];
+}
+## FAILURE 2: Variant is strand-specific but reference is NOT strand-specific ##
+if(($var_strandedness < $min_strandedness || $var_strandedness > $max_strandedness) && ($ref_strandedness >= $min_strandedness && $ref_strandedness <= $max_strandedness)) {
+#$stats{'num_fail_strand'}++;
+push @filter_names, $filters{'strand_bias'}->[0];
+}
+## FAILURE : Variant allele count does not meet minimum ##
+if($var_count < $min_var_count) {
+#$stats{'num_fail_varcount'}++;
+push @filter_names, $filters{'min_var_count'}->[0];
+}
+## FAILURE : Variant allele frequency does not meet minimum ##
+if($var_freq < $min_var_freq) {
+#$stats{'num_fail_varfreq'}++;
+push @filter_names, $filters{'min_var_freq'}->[0];
+}
+## FAILURE 3: Paralog filter for sites where variant allele mismatch-quality-sum is significantly higher than reference allele mmqs
+if($mismatch_qualsum_diff> $max_mm_qualsum_diff) {
+#$stats{'num_fail_mmqs'}++;
+push @filter_names, $filters{'mmqs_diff'}->[0];
+}
+## FAILURE 4: Mapping quality difference exceeds allowable maximum ##
+if($mapqual_diff > $max_mapqual_diff) {
+#$stats{'num_fail_mapqual'}++;
+push @filter_names, $filters{'mq_diff'}->[0];
+}
+## FAILURE 5: Read length difference exceeds allowable maximum ##
+if($readlen_diff > $max_readlen_diff) {
+#$stats{'num_fail_readlen'}++;
+push @filter_names, $filters{'read_length'}->[0];
+}
+## FAILURE 5: Read length difference exceeds allowable maximum ##
+if($var_dist_3 < $min_var_dist_3) {
+#$stats{'num_fail_dist3'}++;
+push @filter_names, $filters{'dist3'}->[0];
+}
+if($max_var_mm_qualsum && $var_mmqs > $max_var_mm_qualsum) {
+#$stats{'num_fail_var_mmqs'}++;
+push @filter_names, $filters{'var_mmqs'}->[0];
+}
+## SUCCESS: Pass Filter ##
+if(@filter_names == 0) {
+#$stats{'num_pass_filter'}++;
+## Print output, and append strandedness information ##
+@filter_names = ('PASS');
+}
+}
+else {
+push @filter_names, $filters{'no_var_readcount'}->[0];
+}
+}
+else {
+#$stats{'num_no_readcounts'}++;
+#print $fail_fh "$line\tno_readcounts\n";
+push @filter_names, $filters{'incomplete_readcount'}->[0];
+}
+return @filter_names;
+}
+sub add_filters_to_vcf_header {
+my ($parsed_header, @filter_refs) = @_;
+my $column_header = pop @$parsed_header;
+for my $filter_ref (@filter_refs) {
+my ($filter_name, $filter_description) = @$filter_ref;
+my $filter_line = qq{##FILTER=<ID=$filter_name,Description="$filter_description">\n};
+push @$parsed_header, $filter_line;
+}
+push @$parsed_header, $column_header;
+}
+sub add_process_log_to_header {
+my ($parsed_header, $input, @params) = @_;
+my $column_header = pop @$parsed_header;
+my $param_string = join(" ", @params);
+push @$parsed_header, qq{##vcfProcessLog=<InputVCF=<$input>, InputVCFSource=<fpfilter>, InputVCFVer=<6.0>, InputVCFParam=<"$param_string"> InputVCFgeneAnno=<.>>\n}, $column_header;
+}
+sub filter_sites_in_hash {
+my ($hash, $bam_readcount_path, $bam_file, $ref_fasta, $working_dir, $optional_param) = @_;
+#done parsing vcf
+$optional_param ||= '';
+my $list_name = File::Spec->catfile($working_dir, "regions.txt");
+my $list_fh = IO::File->new($list_name,"w") or die "Unable to open file for coordinates\n";
+generate_region_list($hash, $list_fh);
+$list_fh->close();
+## run bam-readcount
+my $bam_readcount_cmd = "$bam_readcount_path -f $ref_fasta -l $list_name -w 0 -b 20 $optional_param $bam_file|";
+my $rc_results = IO::File->new($bam_readcount_cmd) or die "Unable to open pipe to bam-readcount cmd: $bam_readcount_cmd\n";
+while(my $rc_line = $rc_results->getline) {
+chomp $rc_line;
+my ($chrom, $position) = split(/\t/, $rc_line);
+if($hash->{$chrom}{$position}) {
+for my $ref (keys %{$hash->{$chrom}{$position}}) {
+for my $var (keys %{$hash->{$chrom}{$position}{$ref}}) {
+my $ref_result = read_counts_by_allele($rc_line, $ref);
+my $var_result = read_counts_by_allele($rc_line, $var);
+my @filters = filter_site($ref_result, $var_result);
+my $vcf_line_ref = $hash->{$chrom}{$position}{$ref}{$var};
+my @fields = split "\t", $$vcf_line_ref;
+if($fields[6] eq '.' || $fields[6] eq 'PASS') {
+$fields[6] = join(";", @filters);
+}
+else {
+$fields[6] = join(";", $fields[6], @filters) if($filters[0] ne 'PASS');
+}
+$$vcf_line_ref = join("\t", @fields);
+}
+}
+}
+else {
+die "Unknown site for rc\n";
+}
+}
+unless($rc_results->close) {
+die "Error running bam-readcount\n";
+}
+}
+sub setup_workdir {
+my ($reference, $bam_file, $bam_index) = @_;
+$reference = abs_path($reference);
+$bam_file = abs_path($bam_file);
+$bam_index = abs_path($bam_index) if $bam_index;
+my $dir = File::Temp->newdir('fpfilterXXXXX', TMPDIR => 1, CLEANUP => 1, DIR => './') or
+die "Unable to create working directory\n";
+#symlink in necessary files to run
+my $working_reference =  File::Spec->catfile($dir, "reference.fa");
+symlink $reference, $working_reference;
+my $fa_index = $reference . ".fai";
+unless(-e $fa_index) {
+index_fasta($working_reference);
+}
+else {
+symlink $fa_index, File::Spec->catfile($dir, "reference.fa.fai");
+}
+my $working_bam = File::Spec->catfile($dir, "tumor.bam");
+my $working_bam_index = File::Spec->catfile($dir, "tumor.bam.bai");
+symlink $bam_file, $working_bam;
+if($bam_index) {
+symlink $bam_index, $working_bam_index;
+}
+elsif(-e $bam_file . ".bai") {
+symlink $bam_file . ".bai", $working_bam_index;
+}
+else {
+index_bam($working_bam);
+}
+return ($dir, $working_reference, $working_bam);
+}
+sub index_fasta {
+my ($fasta) = @_;
+print STDERR "Indexing fasta...\n";
+my @args = ($samtools_path, "faidx", $fasta);
+system(@args) == 0
+or die "Unable to index $fasta: $?\n";
+}
+sub index_bam {
+my ($bam) = @_;
+print STDERR "Indexing BAM...\n";
+my @args = ($samtools_path, "index", $bam);
+system(@args) == 0
+or die "Unable to index $bam: $?\n";
+}

Mercurial > repos > elixir-it > fpfilter

comparison fpfilter.pl @ 0:276875076be1 draft