comparison fpfilter.pl @ 0:276875076be1 draft

Uploaded
author elixir-it
date Tue, 03 Jul 2018 06:05:44 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:276875076be1
1 #!/usr/bin/env perl
2
3 use warnings;
4 use strict;
5
6 use Getopt::Long;
7 use IO::File;
8 use File::Temp qw( tempdir );
9 use File::Spec;
10 use Cwd qw( abs_path cwd);
11
12 ## Define filtering parameters ##
13
14 my $min_read_pos = 0.10;
15 my $max_read_pos = 1 - $min_read_pos;
16 my $min_var_freq = 0.05;
17 my $min_var_count = 4;
18
19 my $min_strandedness = 0.01;
20 my $max_strandedness = 1 - $min_strandedness;
21
22 my $max_mm_qualsum_diff = 50;
23 my $max_mapqual_diff = 30;
24 my $max_readlen_diff = 25;
25 my $min_var_dist_3 = 0.20;
26 my $max_var_mm_qualsum = 100;
27
28
29 ## Parse arguments ##
30
31 my $output_basename;
32
33 my $vcf_file;
34 my $output_file;
35 my $bam_file;
36 my $bam_index;
37 my $sample;
38 my $bam_readcount_path = 'bam-readcount';
39 my $samtools_path = 'samtools';
40 my $ref_fasta;
41 my $help;
42
43
44 my $opt_result;
45 my @params = @ARGV;
46
47 $opt_result = GetOptions(
48 'vcf-file=s' => \$vcf_file,
49 'bam-file=s' => \$bam_file,
50 'bam-index=s' => \$bam_index,
51 'sample=s' => \$sample,
52 'bam-readcount=s' => \$bam_readcount_path,
53 'samtools=s' => \$samtools_path,
54 'reference=s' => \$ref_fasta,
55 'output=s' => \$output_file,
56 'min-read-pos=f' => \$min_read_pos,
57 'min-var-freq=f' => \$min_var_freq,
58 'min-var-count=f' => \$min_var_count,
59 'min-strandedness=f' => \$min_strandedness,
60 'max-mm-qualsum-diff=f' => \$max_mm_qualsum_diff,
61 'max-mapqual-diff=f' => \$max_mapqual_diff,
62 'max-readlen-diff=f' => \$max_readlen_diff,
63 'min-var-dist-3=f' => \$min_var_dist_3,
64 'max-var-mm-qualsum=f' => \$max_var_mm_qualsum,
65 'help' => \$help,
66 );
67
68 unless($opt_result) {
69 die help_text();
70 }
71
72 if($help) {
73 print STDOUT help_text();
74 exit 0;
75 }
76
77 unless($vcf_file) {
78 warn "You must provide a file to be filtered\n";
79 die help_text();
80 }
81
82 unless(-s $vcf_file) {
83 die "Can not find VCF file: $vcf_file\n";
84 }
85
86 unless($ref_fasta) {
87 warn "You must provide a reference fasta for generating readcounts to use for filtering\n";
88 die help_text();
89 }
90
91 unless(-s $ref_fasta) {
92 die "Can not find valid reference fasta: $ref_fasta\n";
93 }
94
95 unless($bam_file) {
96 die "You must provide a BAM file for generating readcounts\n";
97 die help_text();
98 }
99
100 unless(-s $bam_file) {
101 die "Can not find valid BAM file: $bam_file\n";
102 }
103
104 if($bam_index && !-s $bam_index) {
105 die "Can not find valid BAM index file: $bam_index\n";
106 }
107
108 unless($output_file) {
109 warn "You must provide an output file name: $output_file\n";
110 die help_text();
111 }
112 else {
113 $output_file = abs_path($output_file); #make sure we have full path as manipulating cwd below
114 }
115
116 unless($sample) {
117 warn "You must provide a sample name\n";
118 die help_text();
119 }
120
121 my %filters;
122 $filters{'position'} = [sprintf("PB%0.f",$min_read_pos*100), "Average position on read less than " . $min_read_pos . " or greater than " . $max_read_pos . " fraction of the read length"];
123 $filters{'strand_bias'} = [sprintf("SB%0.f",$min_strandedness*100), "Reads supporting the variant have less than " . $min_strandedness . " fraction of the reads on one strand, but reference supporting reads are not similarly biased"];
124 $filters{'min_var_count'} = [ "MVC".$min_var_count, "Less than " . $min_var_count . " high quality reads support the variant"];
125 $filters{'min_var_freq'} = [ sprintf("MVF%0.f",$min_var_freq*100), "Variant allele frequency is less than " . $min_var_freq];
126 $filters{'mmqs_diff'} = [ sprintf("MMQSD%d",$max_mm_qualsum_diff), "Difference in average mismatch quality sum between variant and reference supporting reads is greater than " . $max_mm_qualsum_diff];
127 $filters{'mq_diff'} = [ sprintf("MQD%d",$max_mapqual_diff), "Difference in average mapping quality sum between variant and reference supporting reads is greater than " . $max_mapqual_diff];
128 $filters{'read_length'} = [ sprintf("RLD%d",$max_readlen_diff), "Difference in average clipped read length between variant and reference supporting reads is greater than " . $max_readlen_diff];
129 $filters{'dist3'} = [ sprintf("DETP%0.f",$min_var_dist_3*100), "Average distance of the variant base to the effective 3' end is less than " . $min_var_dist_3];
130 $filters{'var_mmqs'} = [ sprintf("MMQS%d",$max_var_mm_qualsum), "The average mismatch quality sum of reads supporting the variant is greater than " . $max_var_mm_qualsum] if($max_var_mm_qualsum);
131 $filters{'no_var_readcount'} = [ "NRC", "Unable to grab readcounts for variant allele"];
132 $filters{'incomplete_readcount'} = [ "IRC", "Unable to grab any sort of readcount for either the reference or the variant allele"];
133
134 my $vcf_header;
135 my @vcf_lines;
136
137 my %rc_for_snp; # store info on snp positions and VCF line
138 my %rc_for_indel; #store info on indel positions and VCF line
139
140 my ($workdir, $working_fasta, $working_bam) = setup_workdir($ref_fasta, $bam_file, $bam_index);
141
142 my $starting_dir = cwd();
143
144 my $input = IO::File->new($vcf_file) or die "Unable to open input file $vcf_file: $!\n";
145 $vcf_header = parse_vcf_header($input);
146 add_filters_to_vcf_header($vcf_header, values %filters);
147 add_process_log_to_header($vcf_header, $vcf_file, @params);
148
149 my $header_line = $vcf_header->[-1];
150 chomp $header_line;
151 my @header_fields = split "\t", $header_line;
152
153 while(my $entry = $input->getline) {
154 push @vcf_lines, $entry;
155 chomp $entry;
156
157 my %fields;
158 @fields{@header_fields} = split("\t", $entry);
159
160 my $filter_sample = $fields{$sample};
161 unless($filter_sample) {
162 die "Unable to find field for $sample\n";
163 }
164 my @sample_fields = split /:/, $filter_sample;
165 unless(@sample_fields) {
166 die "Unable to parse field for $sample\n";
167 }
168 my $index = 0;
169 my %format_keys = map { $_ => $sample_fields[$index++] } split /:/, $fields{FORMAT};
170 #these are in order ACGT
171 my @alleles = ($fields{REF}, split /,/, $fields{ALT});
172 my %gt_alleles = map {$_ => 1} grep { $_ > 0 } split /\//, $format_keys{GT};
173 my @used_alleles;
174 for my $allele_index (keys %gt_alleles) {
175 push @used_alleles, $alleles[$allele_index];
176 }
177 my ($var) = sort @used_alleles; #follow existing convention of fp filter using alphabetical order to choose a single base on triallelic sites
178 $var = q{} unless defined $var; #in the case where there is no variant allele, set this to the empty string. Later it will be filtered as NRC or IRC
179 $var = uc($var);
180 my $ref = uc($fields{REF});
181 my $chrom = $fields{'#CHROM'};
182 my $pos = $fields{POS};
183
184 if(length($ref) > 1 || length($var) > 1) {
185 #it's an indel or mnp
186 if(length($ref) == length($var)) {
187 die "MNPs unsupported\n";
188 }
189 elsif(length($ref) > length($var)) {
190 #it's a deletion
191 $pos += 1;
192 ($ref, $var) = ($var, $ref);
193 $ref = substr($var, 1, 1);
194 $var = "-" . substr($var, 1);
195 }
196 else {
197 #it's an insertion
198 substr($var, 0, 1, "+");
199 }
200 $rc_for_indel{$chrom}{$pos}{$ref}{$var} = \$vcf_lines[-1];
201 }
202 else {
203 #it's a SNP
204 $rc_for_snp{$chrom}{$pos}{$ref}{$var} = \$vcf_lines[-1];
205 }
206 }
207
208 if(%rc_for_snp) {
209 filter_sites_in_hash(\%rc_for_snp, $bam_readcount_path, $working_bam, $working_fasta, $workdir);
210 }
211 else {
212 print STDERR "No SNP sites identified\n";
213 }
214
215 if(%rc_for_indel) {
216 filter_sites_in_hash(\%rc_for_indel, $bam_readcount_path, $working_bam, $working_fasta, $workdir, '-i');
217 }
218 else {
219 print STDERR "No Indel sites identified\n";
220 }
221
222 ## Open the output files ##
223 my $filtered_vcf = IO::File->new("$output_file","w") or die "Can't open output file $output_file: $!\n";
224 print $filtered_vcf @$vcf_header;
225 print $filtered_vcf @vcf_lines;
226
227 chdir $starting_dir or die "Unable to go back to starting dir\n";
228 exit(0);
229
230 ################################################################################
231
232 =head3 read_counts_by_allele
233
234 Retrieve relevant read counts for a certain allele
235
236
237 =cut
238
239 sub read_counts_by_allele {
240 (my $line, my $allele) = @_;
241
242 my @lineContents = split(/\t/, $line);
243 my $numContents = @lineContents;
244
245 for(my $colCounter = 5; $colCounter < $numContents; $colCounter++) {
246 my $this_allele = $lineContents[$colCounter];
247 my @alleleContents = split(/\:/, $this_allele);
248 if($alleleContents[0] eq $allele) {
249 my $numAlleleContents = @alleleContents;
250
251 return("") if($numAlleleContents < 8);
252
253 my $return_string = "";
254 my $return_sum = 0;
255 for(my $printCounter = 1; $printCounter < $numAlleleContents; $printCounter++) {
256 $return_sum += $alleleContents[$printCounter];
257 $return_string .= "\t" if($return_string);
258 $return_string .= $alleleContents[$printCounter];
259 }
260
261 return($return_string);
262
263 }
264 }
265
266 return("");
267 }
268
269
270 sub help_text {
271 return <<HELP;
272 fpfilter - Filtering for Illumina Sequencing
273
274 SYNOPSIS
275 fpfilter [options] [file ...]
276
277 OPTIONS
278 --vcf-file the input VCF file. Must have a GT field.
279 --bam-file the BAM file of the sample you are filtering on. Typically the tumor.
280 --sample the sample name of the sample you want to filter on in the VCF file.
281 --reference a fasta containing the reference sequence the BAM file was aligned to.
282 --output the filename of the output VCF file
283 --min-read-pos minimum average relative distance from start/end of read
284 --min-var-freq minimum variant allele frequency
285 --min-var-count minimum number of variant-supporting reads
286 --min-strandedness minimum representation of variant allele on each strand
287 --max-mm-qualsum-diff maximum difference of mismatch quality sum between variant and reference reads (paralog filter)
288 --max_var_mm_qualsum maximum mismatch quality sum of reference-supporting reads
289 --max-mapqual-diff maximum difference of mapping quality between variant and reference reads
290 --max-readlen-diff maximum difference of average supporting read length between variant and reference reads (paralog filter)
291 --min-var-dist-3 minimum average distance to effective 3prime end of read (real end or Q2) for variant-supporting reads
292 --help this message
293
294 DESCRIPTION
295 This program will filter a VCF with a variety of filters as detailed in the VarScan2 paper (http://www.ncbi.nlm.nih.gov/pubmed/22300766). It requires the bam-readcount utility (https://github.com/genome/bam-readcount).
296
297 This filter was calibrated on 100bp PE Illumina reads. It is likely to be overly stringent for longer reads and may be less effective on shorter reads.
298
299 AUTHORS
300 Dan Koboldt Original code
301 Dave Larson Modifications for VCF and exportation.
302
303 HELP
304 }
305
306 ### methods copied from elsewhere begin here...
307 sub parse_vcf_header {
308 my $input_fh = shift;
309
310 my @header;
311 my $header_end = 0;
312 while (!$header_end) {
313 my $line = $input_fh->getline;
314 if ($line =~ m/^##/) {
315 push @header, $line;
316 } elsif ($line =~ m/^#/) {
317 push @header, $line;
318 $header_end = 1;
319 } else {
320 die "Missed the final header line with the sample list? Last line examined: $line Header so far: " . join("\n", @header) . "\n";
321 }
322 }
323 return \@header;
324 }
325
326 sub generate_region_list {
327 my ($hash, $region_fh) = @_; #input_fh should be a filehandle to the VCF
328 print STDERR "Printing variants to temporary region_list file...\n";
329 for my $chr (keys %$hash) {
330 for my $pos (sort { $a <=> $b } keys %{$hash->{$chr}}) {
331 print $region_fh "$chr\t$pos\t$pos\n";
332 }
333 }
334 }
335
336 sub _simplify_indel_allele {
337 my ($ref, $var) = @_;
338 #these could be padded e.g. G, GT for a T insertion or GCC G for a 2bp deletion
339 #they could also be complex e.g. GCCCGT, GCGT for a 2bp deletion
340 #they could also represent an insertion deletion event e.g. GCCCGT GCGGGGT; these cannot be represented in genome bed. Throw an error or warn.
341 #
342 #I think the algorithm should be trim end (no updating of coords required)
343 #trim beginning and return number of bases trimmed
344
345 my @ref_array = map { uc } split //, $ref;
346 my @var_array = map { uc } split //, $var;
347
348 while(@ref_array and @var_array and $ref_array[-1] eq $var_array[-1]) {
349 pop @ref_array;
350 pop @var_array;
351 }
352
353 my $right_shift = 0;
354 while(@ref_array and @var_array and $ref_array[0] eq $var_array[0]) {
355 shift @ref_array;
356 shift @var_array;
357 $right_shift++;
358 }
359
360 return (join("",@ref_array), join("",@var_array), $right_shift);
361 }
362
363 sub filter_site {
364 my ($ref_result, $var_result) = @_;
365 #this will return a list of filter names
366 my @filter_names;
367 if($ref_result && $var_result) {
368 ## Parse out the bam-readcounts details for each allele. The fields should be: ##
369 #num_reads : avg_mapqual : avg_basequal : avg_semq : reads_plus : reads_minus : avg_clip_read_pos : avg_mmqs : reads_q2 : avg_dist_to_q2 : avgRLclipped : avg_eff_3'_dist
370 my ($ref_count, $ref_map_qual, $ref_base_qual, $ref_semq, $ref_plus, $ref_minus, $ref_pos, $ref_subs, $ref_mmqs, $ref_q2_reads, $ref_q2_dist, $ref_avg_rl, $ref_dist_3) = split(/\t/, $ref_result);
371 my ($var_count, $var_map_qual, $var_base_qual, $var_semq, $var_plus, $var_minus, $var_pos, $var_subs, $var_mmqs, $var_q2_reads, $var_q2_dist, $var_avg_rl, $var_dist_3) = split(/\t/, $var_result);
372
373 my $ref_strandedness = my $var_strandedness = 0.50;
374 $ref_dist_3 = 0.5 if(!$ref_dist_3);
375
376 ## Use conservative defaults if we can't get mismatch quality sums ##
377 $ref_mmqs = 50 if(!$ref_mmqs);
378 $var_mmqs = 0 if(!$var_mmqs);
379 my $mismatch_qualsum_diff = $var_mmqs - $ref_mmqs;
380
381 ## Determine map qual diff ##
382
383 my $mapqual_diff = $ref_map_qual - $var_map_qual;
384
385
386 ## Determine difference in average supporting read length ##
387
388 my $readlen_diff = $ref_avg_rl - $var_avg_rl;
389
390
391 ## Determine ref strandedness ##
392
393 if(($ref_plus + $ref_minus) > 0) {
394 $ref_strandedness = $ref_plus / ($ref_plus + $ref_minus);
395 $ref_strandedness = sprintf("%.2f", $ref_strandedness);
396 }
397
398 ## Determine var strandedness ##
399
400 if(($var_plus + $var_minus) > 0) {
401 $var_strandedness = $var_plus / ($var_plus + $var_minus);
402 $var_strandedness = sprintf("%.2f", $var_strandedness);
403 }
404
405 if($var_count && ($var_plus + $var_minus)) {
406 ## We must obtain variant read counts to proceed ##
407
408 my $var_freq = $var_count / ($ref_count + $var_count);
409
410 ## FAILURE 1: READ POSITION ##
411 if(($var_pos < $min_read_pos) || ($var_pos > $max_read_pos)) {
412 #$stats{'num_fail_pos'}++;
413 push @filter_names, $filters{'position'}->[0];
414 }
415
416 ## FAILURE 2: Variant is strand-specific but reference is NOT strand-specific ##
417 if(($var_strandedness < $min_strandedness || $var_strandedness > $max_strandedness) && ($ref_strandedness >= $min_strandedness && $ref_strandedness <= $max_strandedness)) {
418 #$stats{'num_fail_strand'}++;
419 push @filter_names, $filters{'strand_bias'}->[0];
420 }
421
422 ## FAILURE : Variant allele count does not meet minimum ##
423 if($var_count < $min_var_count) {
424 #$stats{'num_fail_varcount'}++;
425 push @filter_names, $filters{'min_var_count'}->[0];
426 }
427
428 ## FAILURE : Variant allele frequency does not meet minimum ##
429 if($var_freq < $min_var_freq) {
430 #$stats{'num_fail_varfreq'}++;
431 push @filter_names, $filters{'min_var_freq'}->[0];
432 }
433
434 ## FAILURE 3: Paralog filter for sites where variant allele mismatch-quality-sum is significantly higher than reference allele mmqs
435 if($mismatch_qualsum_diff> $max_mm_qualsum_diff) {
436 #$stats{'num_fail_mmqs'}++;
437 push @filter_names, $filters{'mmqs_diff'}->[0];
438 }
439
440 ## FAILURE 4: Mapping quality difference exceeds allowable maximum ##
441 if($mapqual_diff > $max_mapqual_diff) {
442 #$stats{'num_fail_mapqual'}++;
443 push @filter_names, $filters{'mq_diff'}->[0];
444 }
445
446 ## FAILURE 5: Read length difference exceeds allowable maximum ##
447 if($readlen_diff > $max_readlen_diff) {
448 #$stats{'num_fail_readlen'}++;
449 push @filter_names, $filters{'read_length'}->[0];
450 }
451
452 ## FAILURE 5: Read length difference exceeds allowable maximum ##
453 if($var_dist_3 < $min_var_dist_3) {
454 #$stats{'num_fail_dist3'}++;
455 push @filter_names, $filters{'dist3'}->[0];
456 }
457
458 if($max_var_mm_qualsum && $var_mmqs > $max_var_mm_qualsum) {
459 #$stats{'num_fail_var_mmqs'}++;
460 push @filter_names, $filters{'var_mmqs'}->[0];
461 }
462
463 ## SUCCESS: Pass Filter ##
464 if(@filter_names == 0) {
465 #$stats{'num_pass_filter'}++;
466 ## Print output, and append strandedness information ##
467 @filter_names = ('PASS');
468 }
469
470 }
471 else {
472 push @filter_names, $filters{'no_var_readcount'}->[0];
473 }
474 }
475 else {
476 #$stats{'num_no_readcounts'}++;
477 #print $fail_fh "$line\tno_readcounts\n";
478 push @filter_names, $filters{'incomplete_readcount'}->[0];
479 }
480 return @filter_names;
481 }
482
483 sub add_filters_to_vcf_header {
484 my ($parsed_header, @filter_refs) = @_;
485 my $column_header = pop @$parsed_header;
486 for my $filter_ref (@filter_refs) {
487 my ($filter_name, $filter_description) = @$filter_ref;
488 my $filter_line = qq{##FILTER=<ID=$filter_name,Description="$filter_description">\n};
489 push @$parsed_header, $filter_line;
490 }
491 push @$parsed_header, $column_header;
492 }
493
494 sub add_process_log_to_header {
495 my ($parsed_header, $input, @params) = @_;
496 my $column_header = pop @$parsed_header;
497 my $param_string = join(" ", @params);
498 push @$parsed_header, qq{##vcfProcessLog=<InputVCF=<$input>, InputVCFSource=<fpfilter>, InputVCFVer=<6.0>, InputVCFParam=<"$param_string"> InputVCFgeneAnno=<.>>\n}, $column_header;
499 }
500
501 sub filter_sites_in_hash {
502 my ($hash, $bam_readcount_path, $bam_file, $ref_fasta, $working_dir, $optional_param) = @_;
503 #done parsing vcf
504 $optional_param ||= '';
505 my $list_name = File::Spec->catfile($working_dir, "regions.txt");
506 my $list_fh = IO::File->new($list_name,"w") or die "Unable to open file for coordinates\n";
507 generate_region_list($hash, $list_fh);
508 $list_fh->close();
509
510 ## run bam-readcount
511 my $bam_readcount_cmd = "$bam_readcount_path -f $ref_fasta -l $list_name -w 0 -b 20 $optional_param $bam_file|";
512 my $rc_results = IO::File->new($bam_readcount_cmd) or die "Unable to open pipe to bam-readcount cmd: $bam_readcount_cmd\n";
513 while(my $rc_line = $rc_results->getline) {
514 chomp $rc_line;
515 my ($chrom, $position) = split(/\t/, $rc_line);
516 if($hash->{$chrom}{$position}) {
517 for my $ref (keys %{$hash->{$chrom}{$position}}) {
518 for my $var (keys %{$hash->{$chrom}{$position}{$ref}}) {
519 my $ref_result = read_counts_by_allele($rc_line, $ref);
520 my $var_result = read_counts_by_allele($rc_line, $var);
521 my @filters = filter_site($ref_result, $var_result);
522
523 my $vcf_line_ref = $hash->{$chrom}{$position}{$ref}{$var};
524 my @fields = split "\t", $$vcf_line_ref;
525 if($fields[6] eq '.' || $fields[6] eq 'PASS') {
526 $fields[6] = join(";", @filters);
527 }
528 else {
529 $fields[6] = join(";", $fields[6], @filters) if($filters[0] ne 'PASS');
530 }
531 $$vcf_line_ref = join("\t", @fields);
532 }
533 }
534 }
535 else {
536 die "Unknown site for rc\n";
537 }
538 }
539 unless($rc_results->close) {
540 die "Error running bam-readcount\n";
541 }
542 }
543
544 sub setup_workdir {
545 my ($reference, $bam_file, $bam_index) = @_;
546 $reference = abs_path($reference);
547 $bam_file = abs_path($bam_file);
548 $bam_index = abs_path($bam_index) if $bam_index;
549
550 my $dir = File::Temp->newdir('fpfilterXXXXX', TMPDIR => 1, CLEANUP => 1, DIR => './') or
551 die "Unable to create working directory\n";
552
553 #symlink in necessary files to run
554 my $working_reference = File::Spec->catfile($dir, "reference.fa");
555 symlink $reference, $working_reference;
556
557 my $fa_index = $reference . ".fai";
558 unless(-e $fa_index) {
559 index_fasta($working_reference);
560 }
561 else {
562 symlink $fa_index, File::Spec->catfile($dir, "reference.fa.fai");
563 }
564
565 my $working_bam = File::Spec->catfile($dir, "tumor.bam");
566 my $working_bam_index = File::Spec->catfile($dir, "tumor.bam.bai");
567 symlink $bam_file, $working_bam;
568 if($bam_index) {
569 symlink $bam_index, $working_bam_index;
570 }
571 elsif(-e $bam_file . ".bai") {
572 symlink $bam_file . ".bai", $working_bam_index;
573 }
574 else {
575 index_bam($working_bam);
576 }
577 return ($dir, $working_reference, $working_bam);
578 }
579
580 sub index_fasta {
581 my ($fasta) = @_;
582
583 print STDERR "Indexing fasta...\n";
584 my @args = ($samtools_path, "faidx", $fasta);
585 system(@args) == 0
586 or die "Unable to index $fasta: $?\n";
587 }
588
589 sub index_bam {
590 my ($bam) = @_;
591
592 print STDERR "Indexing BAM...\n";
593 my @args = ($samtools_path, "index", $bam);
594 system(@args) == 0
595 or die "Unable to index $bam: $?\n";
596 }