Mercurial > repos > elixir-it > fpfilter
comparison fpfilter.pl @ 0:276875076be1 draft
Uploaded
| author | elixir-it |
|---|---|
| date | Tue, 03 Jul 2018 06:05:44 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:276875076be1 |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 use warnings; | |
| 4 use strict; | |
| 5 | |
| 6 use Getopt::Long; | |
| 7 use IO::File; | |
| 8 use File::Temp qw( tempdir ); | |
| 9 use File::Spec; | |
| 10 use Cwd qw( abs_path cwd); | |
| 11 | |
| 12 ## Define filtering parameters ## | |
| 13 | |
| 14 my $min_read_pos = 0.10; | |
| 15 my $max_read_pos = 1 - $min_read_pos; | |
| 16 my $min_var_freq = 0.05; | |
| 17 my $min_var_count = 4; | |
| 18 | |
| 19 my $min_strandedness = 0.01; | |
| 20 my $max_strandedness = 1 - $min_strandedness; | |
| 21 | |
| 22 my $max_mm_qualsum_diff = 50; | |
| 23 my $max_mapqual_diff = 30; | |
| 24 my $max_readlen_diff = 25; | |
| 25 my $min_var_dist_3 = 0.20; | |
| 26 my $max_var_mm_qualsum = 100; | |
| 27 | |
| 28 | |
| 29 ## Parse arguments ## | |
| 30 | |
| 31 my $output_basename; | |
| 32 | |
| 33 my $vcf_file; | |
| 34 my $output_file; | |
| 35 my $bam_file; | |
| 36 my $bam_index; | |
| 37 my $sample; | |
| 38 my $bam_readcount_path = 'bam-readcount'; | |
| 39 my $samtools_path = 'samtools'; | |
| 40 my $ref_fasta; | |
| 41 my $help; | |
| 42 | |
| 43 | |
| 44 my $opt_result; | |
| 45 my @params = @ARGV; | |
| 46 | |
| 47 $opt_result = GetOptions( | |
| 48 'vcf-file=s' => \$vcf_file, | |
| 49 'bam-file=s' => \$bam_file, | |
| 50 'bam-index=s' => \$bam_index, | |
| 51 'sample=s' => \$sample, | |
| 52 'bam-readcount=s' => \$bam_readcount_path, | |
| 53 'samtools=s' => \$samtools_path, | |
| 54 'reference=s' => \$ref_fasta, | |
| 55 'output=s' => \$output_file, | |
| 56 'min-read-pos=f' => \$min_read_pos, | |
| 57 'min-var-freq=f' => \$min_var_freq, | |
| 58 'min-var-count=f' => \$min_var_count, | |
| 59 'min-strandedness=f' => \$min_strandedness, | |
| 60 'max-mm-qualsum-diff=f' => \$max_mm_qualsum_diff, | |
| 61 'max-mapqual-diff=f' => \$max_mapqual_diff, | |
| 62 'max-readlen-diff=f' => \$max_readlen_diff, | |
| 63 'min-var-dist-3=f' => \$min_var_dist_3, | |
| 64 'max-var-mm-qualsum=f' => \$max_var_mm_qualsum, | |
| 65 'help' => \$help, | |
| 66 ); | |
| 67 | |
| 68 unless($opt_result) { | |
| 69 die help_text(); | |
| 70 } | |
| 71 | |
| 72 if($help) { | |
| 73 print STDOUT help_text(); | |
| 74 exit 0; | |
| 75 } | |
| 76 | |
| 77 unless($vcf_file) { | |
| 78 warn "You must provide a file to be filtered\n"; | |
| 79 die help_text(); | |
| 80 } | |
| 81 | |
| 82 unless(-s $vcf_file) { | |
| 83 die "Can not find VCF file: $vcf_file\n"; | |
| 84 } | |
| 85 | |
| 86 unless($ref_fasta) { | |
| 87 warn "You must provide a reference fasta for generating readcounts to use for filtering\n"; | |
| 88 die help_text(); | |
| 89 } | |
| 90 | |
| 91 unless(-s $ref_fasta) { | |
| 92 die "Can not find valid reference fasta: $ref_fasta\n"; | |
| 93 } | |
| 94 | |
| 95 unless($bam_file) { | |
| 96 die "You must provide a BAM file for generating readcounts\n"; | |
| 97 die help_text(); | |
| 98 } | |
| 99 | |
| 100 unless(-s $bam_file) { | |
| 101 die "Can not find valid BAM file: $bam_file\n"; | |
| 102 } | |
| 103 | |
| 104 if($bam_index && !-s $bam_index) { | |
| 105 die "Can not find valid BAM index file: $bam_index\n"; | |
| 106 } | |
| 107 | |
| 108 unless($output_file) { | |
| 109 warn "You must provide an output file name: $output_file\n"; | |
| 110 die help_text(); | |
| 111 } | |
| 112 else { | |
| 113 $output_file = abs_path($output_file); #make sure we have full path as manipulating cwd below | |
| 114 } | |
| 115 | |
| 116 unless($sample) { | |
| 117 warn "You must provide a sample name\n"; | |
| 118 die help_text(); | |
| 119 } | |
| 120 | |
| 121 my %filters; | |
| 122 $filters{'position'} = [sprintf("PB%0.f",$min_read_pos*100), "Average position on read less than " . $min_read_pos . " or greater than " . $max_read_pos . " fraction of the read length"]; | |
| 123 $filters{'strand_bias'} = [sprintf("SB%0.f",$min_strandedness*100), "Reads supporting the variant have less than " . $min_strandedness . " fraction of the reads on one strand, but reference supporting reads are not similarly biased"]; | |
| 124 $filters{'min_var_count'} = [ "MVC".$min_var_count, "Less than " . $min_var_count . " high quality reads support the variant"]; | |
| 125 $filters{'min_var_freq'} = [ sprintf("MVF%0.f",$min_var_freq*100), "Variant allele frequency is less than " . $min_var_freq]; | |
| 126 $filters{'mmqs_diff'} = [ sprintf("MMQSD%d",$max_mm_qualsum_diff), "Difference in average mismatch quality sum between variant and reference supporting reads is greater than " . $max_mm_qualsum_diff]; | |
| 127 $filters{'mq_diff'} = [ sprintf("MQD%d",$max_mapqual_diff), "Difference in average mapping quality sum between variant and reference supporting reads is greater than " . $max_mapqual_diff]; | |
| 128 $filters{'read_length'} = [ sprintf("RLD%d",$max_readlen_diff), "Difference in average clipped read length between variant and reference supporting reads is greater than " . $max_readlen_diff]; | |
| 129 $filters{'dist3'} = [ sprintf("DETP%0.f",$min_var_dist_3*100), "Average distance of the variant base to the effective 3' end is less than " . $min_var_dist_3]; | |
| 130 $filters{'var_mmqs'} = [ sprintf("MMQS%d",$max_var_mm_qualsum), "The average mismatch quality sum of reads supporting the variant is greater than " . $max_var_mm_qualsum] if($max_var_mm_qualsum); | |
| 131 $filters{'no_var_readcount'} = [ "NRC", "Unable to grab readcounts for variant allele"]; | |
| 132 $filters{'incomplete_readcount'} = [ "IRC", "Unable to grab any sort of readcount for either the reference or the variant allele"]; | |
| 133 | |
| 134 my $vcf_header; | |
| 135 my @vcf_lines; | |
| 136 | |
| 137 my %rc_for_snp; # store info on snp positions and VCF line | |
| 138 my %rc_for_indel; #store info on indel positions and VCF line | |
| 139 | |
| 140 my ($workdir, $working_fasta, $working_bam) = setup_workdir($ref_fasta, $bam_file, $bam_index); | |
| 141 | |
| 142 my $starting_dir = cwd(); | |
| 143 | |
| 144 my $input = IO::File->new($vcf_file) or die "Unable to open input file $vcf_file: $!\n"; | |
| 145 $vcf_header = parse_vcf_header($input); | |
| 146 add_filters_to_vcf_header($vcf_header, values %filters); | |
| 147 add_process_log_to_header($vcf_header, $vcf_file, @params); | |
| 148 | |
| 149 my $header_line = $vcf_header->[-1]; | |
| 150 chomp $header_line; | |
| 151 my @header_fields = split "\t", $header_line; | |
| 152 | |
| 153 while(my $entry = $input->getline) { | |
| 154 push @vcf_lines, $entry; | |
| 155 chomp $entry; | |
| 156 | |
| 157 my %fields; | |
| 158 @fields{@header_fields} = split("\t", $entry); | |
| 159 | |
| 160 my $filter_sample = $fields{$sample}; | |
| 161 unless($filter_sample) { | |
| 162 die "Unable to find field for $sample\n"; | |
| 163 } | |
| 164 my @sample_fields = split /:/, $filter_sample; | |
| 165 unless(@sample_fields) { | |
| 166 die "Unable to parse field for $sample\n"; | |
| 167 } | |
| 168 my $index = 0; | |
| 169 my %format_keys = map { $_ => $sample_fields[$index++] } split /:/, $fields{FORMAT}; | |
| 170 #these are in order ACGT | |
| 171 my @alleles = ($fields{REF}, split /,/, $fields{ALT}); | |
| 172 my %gt_alleles = map {$_ => 1} grep { $_ > 0 } split /\//, $format_keys{GT}; | |
| 173 my @used_alleles; | |
| 174 for my $allele_index (keys %gt_alleles) { | |
| 175 push @used_alleles, $alleles[$allele_index]; | |
| 176 } | |
| 177 my ($var) = sort @used_alleles; #follow existing convention of fp filter using alphabetical order to choose a single base on triallelic sites | |
| 178 $var = q{} unless defined $var; #in the case where there is no variant allele, set this to the empty string. Later it will be filtered as NRC or IRC | |
| 179 $var = uc($var); | |
| 180 my $ref = uc($fields{REF}); | |
| 181 my $chrom = $fields{'#CHROM'}; | |
| 182 my $pos = $fields{POS}; | |
| 183 | |
| 184 if(length($ref) > 1 || length($var) > 1) { | |
| 185 #it's an indel or mnp | |
| 186 if(length($ref) == length($var)) { | |
| 187 die "MNPs unsupported\n"; | |
| 188 } | |
| 189 elsif(length($ref) > length($var)) { | |
| 190 #it's a deletion | |
| 191 $pos += 1; | |
| 192 ($ref, $var) = ($var, $ref); | |
| 193 $ref = substr($var, 1, 1); | |
| 194 $var = "-" . substr($var, 1); | |
| 195 } | |
| 196 else { | |
| 197 #it's an insertion | |
| 198 substr($var, 0, 1, "+"); | |
| 199 } | |
| 200 $rc_for_indel{$chrom}{$pos}{$ref}{$var} = \$vcf_lines[-1]; | |
| 201 } | |
| 202 else { | |
| 203 #it's a SNP | |
| 204 $rc_for_snp{$chrom}{$pos}{$ref}{$var} = \$vcf_lines[-1]; | |
| 205 } | |
| 206 } | |
| 207 | |
| 208 if(%rc_for_snp) { | |
| 209 filter_sites_in_hash(\%rc_for_snp, $bam_readcount_path, $working_bam, $working_fasta, $workdir); | |
| 210 } | |
| 211 else { | |
| 212 print STDERR "No SNP sites identified\n"; | |
| 213 } | |
| 214 | |
| 215 if(%rc_for_indel) { | |
| 216 filter_sites_in_hash(\%rc_for_indel, $bam_readcount_path, $working_bam, $working_fasta, $workdir, '-i'); | |
| 217 } | |
| 218 else { | |
| 219 print STDERR "No Indel sites identified\n"; | |
| 220 } | |
| 221 | |
| 222 ## Open the output files ## | |
| 223 my $filtered_vcf = IO::File->new("$output_file","w") or die "Can't open output file $output_file: $!\n"; | |
| 224 print $filtered_vcf @$vcf_header; | |
| 225 print $filtered_vcf @vcf_lines; | |
| 226 | |
| 227 chdir $starting_dir or die "Unable to go back to starting dir\n"; | |
| 228 exit(0); | |
| 229 | |
| 230 ################################################################################ | |
| 231 | |
| 232 =head3 read_counts_by_allele | |
| 233 | |
| 234 Retrieve relevant read counts for a certain allele | |
| 235 | |
| 236 | |
| 237 =cut | |
| 238 | |
| 239 sub read_counts_by_allele { | |
| 240 (my $line, my $allele) = @_; | |
| 241 | |
| 242 my @lineContents = split(/\t/, $line); | |
| 243 my $numContents = @lineContents; | |
| 244 | |
| 245 for(my $colCounter = 5; $colCounter < $numContents; $colCounter++) { | |
| 246 my $this_allele = $lineContents[$colCounter]; | |
| 247 my @alleleContents = split(/\:/, $this_allele); | |
| 248 if($alleleContents[0] eq $allele) { | |
| 249 my $numAlleleContents = @alleleContents; | |
| 250 | |
| 251 return("") if($numAlleleContents < 8); | |
| 252 | |
| 253 my $return_string = ""; | |
| 254 my $return_sum = 0; | |
| 255 for(my $printCounter = 1; $printCounter < $numAlleleContents; $printCounter++) { | |
| 256 $return_sum += $alleleContents[$printCounter]; | |
| 257 $return_string .= "\t" if($return_string); | |
| 258 $return_string .= $alleleContents[$printCounter]; | |
| 259 } | |
| 260 | |
| 261 return($return_string); | |
| 262 | |
| 263 } | |
| 264 } | |
| 265 | |
| 266 return(""); | |
| 267 } | |
| 268 | |
| 269 | |
| 270 sub help_text { | |
| 271 return <<HELP; | |
| 272 fpfilter - Filtering for Illumina Sequencing | |
| 273 | |
| 274 SYNOPSIS | |
| 275 fpfilter [options] [file ...] | |
| 276 | |
| 277 OPTIONS | |
| 278 --vcf-file the input VCF file. Must have a GT field. | |
| 279 --bam-file the BAM file of the sample you are filtering on. Typically the tumor. | |
| 280 --sample the sample name of the sample you want to filter on in the VCF file. | |
| 281 --reference a fasta containing the reference sequence the BAM file was aligned to. | |
| 282 --output the filename of the output VCF file | |
| 283 --min-read-pos minimum average relative distance from start/end of read | |
| 284 --min-var-freq minimum variant allele frequency | |
| 285 --min-var-count minimum number of variant-supporting reads | |
| 286 --min-strandedness minimum representation of variant allele on each strand | |
| 287 --max-mm-qualsum-diff maximum difference of mismatch quality sum between variant and reference reads (paralog filter) | |
| 288 --max_var_mm_qualsum maximum mismatch quality sum of reference-supporting reads | |
| 289 --max-mapqual-diff maximum difference of mapping quality between variant and reference reads | |
| 290 --max-readlen-diff maximum difference of average supporting read length between variant and reference reads (paralog filter) | |
| 291 --min-var-dist-3 minimum average distance to effective 3prime end of read (real end or Q2) for variant-supporting reads | |
| 292 --help this message | |
| 293 | |
| 294 DESCRIPTION | |
| 295 This program will filter a VCF with a variety of filters as detailed in the VarScan2 paper (http://www.ncbi.nlm.nih.gov/pubmed/22300766). It requires the bam-readcount utility (https://github.com/genome/bam-readcount). | |
| 296 | |
| 297 This filter was calibrated on 100bp PE Illumina reads. It is likely to be overly stringent for longer reads and may be less effective on shorter reads. | |
| 298 | |
| 299 AUTHORS | |
| 300 Dan Koboldt Original code | |
| 301 Dave Larson Modifications for VCF and exportation. | |
| 302 | |
| 303 HELP | |
| 304 } | |
| 305 | |
| 306 ### methods copied from elsewhere begin here... | |
| 307 sub parse_vcf_header { | |
| 308 my $input_fh = shift; | |
| 309 | |
| 310 my @header; | |
| 311 my $header_end = 0; | |
| 312 while (!$header_end) { | |
| 313 my $line = $input_fh->getline; | |
| 314 if ($line =~ m/^##/) { | |
| 315 push @header, $line; | |
| 316 } elsif ($line =~ m/^#/) { | |
| 317 push @header, $line; | |
| 318 $header_end = 1; | |
| 319 } else { | |
| 320 die "Missed the final header line with the sample list? Last line examined: $line Header so far: " . join("\n", @header) . "\n"; | |
| 321 } | |
| 322 } | |
| 323 return \@header; | |
| 324 } | |
| 325 | |
| 326 sub generate_region_list { | |
| 327 my ($hash, $region_fh) = @_; #input_fh should be a filehandle to the VCF | |
| 328 print STDERR "Printing variants to temporary region_list file...\n"; | |
| 329 for my $chr (keys %$hash) { | |
| 330 for my $pos (sort { $a <=> $b } keys %{$hash->{$chr}}) { | |
| 331 print $region_fh "$chr\t$pos\t$pos\n"; | |
| 332 } | |
| 333 } | |
| 334 } | |
| 335 | |
| 336 sub _simplify_indel_allele { | |
| 337 my ($ref, $var) = @_; | |
| 338 #these could be padded e.g. G, GT for a T insertion or GCC G for a 2bp deletion | |
| 339 #they could also be complex e.g. GCCCGT, GCGT for a 2bp deletion | |
| 340 #they could also represent an insertion deletion event e.g. GCCCGT GCGGGGT; these cannot be represented in genome bed. Throw an error or warn. | |
| 341 # | |
| 342 #I think the algorithm should be trim end (no updating of coords required) | |
| 343 #trim beginning and return number of bases trimmed | |
| 344 | |
| 345 my @ref_array = map { uc } split //, $ref; | |
| 346 my @var_array = map { uc } split //, $var; | |
| 347 | |
| 348 while(@ref_array and @var_array and $ref_array[-1] eq $var_array[-1]) { | |
| 349 pop @ref_array; | |
| 350 pop @var_array; | |
| 351 } | |
| 352 | |
| 353 my $right_shift = 0; | |
| 354 while(@ref_array and @var_array and $ref_array[0] eq $var_array[0]) { | |
| 355 shift @ref_array; | |
| 356 shift @var_array; | |
| 357 $right_shift++; | |
| 358 } | |
| 359 | |
| 360 return (join("",@ref_array), join("",@var_array), $right_shift); | |
| 361 } | |
| 362 | |
| 363 sub filter_site { | |
| 364 my ($ref_result, $var_result) = @_; | |
| 365 #this will return a list of filter names | |
| 366 my @filter_names; | |
| 367 if($ref_result && $var_result) { | |
| 368 ## Parse out the bam-readcounts details for each allele. The fields should be: ## | |
| 369 #num_reads : avg_mapqual : avg_basequal : avg_semq : reads_plus : reads_minus : avg_clip_read_pos : avg_mmqs : reads_q2 : avg_dist_to_q2 : avgRLclipped : avg_eff_3'_dist | |
| 370 my ($ref_count, $ref_map_qual, $ref_base_qual, $ref_semq, $ref_plus, $ref_minus, $ref_pos, $ref_subs, $ref_mmqs, $ref_q2_reads, $ref_q2_dist, $ref_avg_rl, $ref_dist_3) = split(/\t/, $ref_result); | |
| 371 my ($var_count, $var_map_qual, $var_base_qual, $var_semq, $var_plus, $var_minus, $var_pos, $var_subs, $var_mmqs, $var_q2_reads, $var_q2_dist, $var_avg_rl, $var_dist_3) = split(/\t/, $var_result); | |
| 372 | |
| 373 my $ref_strandedness = my $var_strandedness = 0.50; | |
| 374 $ref_dist_3 = 0.5 if(!$ref_dist_3); | |
| 375 | |
| 376 ## Use conservative defaults if we can't get mismatch quality sums ## | |
| 377 $ref_mmqs = 50 if(!$ref_mmqs); | |
| 378 $var_mmqs = 0 if(!$var_mmqs); | |
| 379 my $mismatch_qualsum_diff = $var_mmqs - $ref_mmqs; | |
| 380 | |
| 381 ## Determine map qual diff ## | |
| 382 | |
| 383 my $mapqual_diff = $ref_map_qual - $var_map_qual; | |
| 384 | |
| 385 | |
| 386 ## Determine difference in average supporting read length ## | |
| 387 | |
| 388 my $readlen_diff = $ref_avg_rl - $var_avg_rl; | |
| 389 | |
| 390 | |
| 391 ## Determine ref strandedness ## | |
| 392 | |
| 393 if(($ref_plus + $ref_minus) > 0) { | |
| 394 $ref_strandedness = $ref_plus / ($ref_plus + $ref_minus); | |
| 395 $ref_strandedness = sprintf("%.2f", $ref_strandedness); | |
| 396 } | |
| 397 | |
| 398 ## Determine var strandedness ## | |
| 399 | |
| 400 if(($var_plus + $var_minus) > 0) { | |
| 401 $var_strandedness = $var_plus / ($var_plus + $var_minus); | |
| 402 $var_strandedness = sprintf("%.2f", $var_strandedness); | |
| 403 } | |
| 404 | |
| 405 if($var_count && ($var_plus + $var_minus)) { | |
| 406 ## We must obtain variant read counts to proceed ## | |
| 407 | |
| 408 my $var_freq = $var_count / ($ref_count + $var_count); | |
| 409 | |
| 410 ## FAILURE 1: READ POSITION ## | |
| 411 if(($var_pos < $min_read_pos) || ($var_pos > $max_read_pos)) { | |
| 412 #$stats{'num_fail_pos'}++; | |
| 413 push @filter_names, $filters{'position'}->[0]; | |
| 414 } | |
| 415 | |
| 416 ## FAILURE 2: Variant is strand-specific but reference is NOT strand-specific ## | |
| 417 if(($var_strandedness < $min_strandedness || $var_strandedness > $max_strandedness) && ($ref_strandedness >= $min_strandedness && $ref_strandedness <= $max_strandedness)) { | |
| 418 #$stats{'num_fail_strand'}++; | |
| 419 push @filter_names, $filters{'strand_bias'}->[0]; | |
| 420 } | |
| 421 | |
| 422 ## FAILURE : Variant allele count does not meet minimum ## | |
| 423 if($var_count < $min_var_count) { | |
| 424 #$stats{'num_fail_varcount'}++; | |
| 425 push @filter_names, $filters{'min_var_count'}->[0]; | |
| 426 } | |
| 427 | |
| 428 ## FAILURE : Variant allele frequency does not meet minimum ## | |
| 429 if($var_freq < $min_var_freq) { | |
| 430 #$stats{'num_fail_varfreq'}++; | |
| 431 push @filter_names, $filters{'min_var_freq'}->[0]; | |
| 432 } | |
| 433 | |
| 434 ## FAILURE 3: Paralog filter for sites where variant allele mismatch-quality-sum is significantly higher than reference allele mmqs | |
| 435 if($mismatch_qualsum_diff> $max_mm_qualsum_diff) { | |
| 436 #$stats{'num_fail_mmqs'}++; | |
| 437 push @filter_names, $filters{'mmqs_diff'}->[0]; | |
| 438 } | |
| 439 | |
| 440 ## FAILURE 4: Mapping quality difference exceeds allowable maximum ## | |
| 441 if($mapqual_diff > $max_mapqual_diff) { | |
| 442 #$stats{'num_fail_mapqual'}++; | |
| 443 push @filter_names, $filters{'mq_diff'}->[0]; | |
| 444 } | |
| 445 | |
| 446 ## FAILURE 5: Read length difference exceeds allowable maximum ## | |
| 447 if($readlen_diff > $max_readlen_diff) { | |
| 448 #$stats{'num_fail_readlen'}++; | |
| 449 push @filter_names, $filters{'read_length'}->[0]; | |
| 450 } | |
| 451 | |
| 452 ## FAILURE 5: Read length difference exceeds allowable maximum ## | |
| 453 if($var_dist_3 < $min_var_dist_3) { | |
| 454 #$stats{'num_fail_dist3'}++; | |
| 455 push @filter_names, $filters{'dist3'}->[0]; | |
| 456 } | |
| 457 | |
| 458 if($max_var_mm_qualsum && $var_mmqs > $max_var_mm_qualsum) { | |
| 459 #$stats{'num_fail_var_mmqs'}++; | |
| 460 push @filter_names, $filters{'var_mmqs'}->[0]; | |
| 461 } | |
| 462 | |
| 463 ## SUCCESS: Pass Filter ## | |
| 464 if(@filter_names == 0) { | |
| 465 #$stats{'num_pass_filter'}++; | |
| 466 ## Print output, and append strandedness information ## | |
| 467 @filter_names = ('PASS'); | |
| 468 } | |
| 469 | |
| 470 } | |
| 471 else { | |
| 472 push @filter_names, $filters{'no_var_readcount'}->[0]; | |
| 473 } | |
| 474 } | |
| 475 else { | |
| 476 #$stats{'num_no_readcounts'}++; | |
| 477 #print $fail_fh "$line\tno_readcounts\n"; | |
| 478 push @filter_names, $filters{'incomplete_readcount'}->[0]; | |
| 479 } | |
| 480 return @filter_names; | |
| 481 } | |
| 482 | |
| 483 sub add_filters_to_vcf_header { | |
| 484 my ($parsed_header, @filter_refs) = @_; | |
| 485 my $column_header = pop @$parsed_header; | |
| 486 for my $filter_ref (@filter_refs) { | |
| 487 my ($filter_name, $filter_description) = @$filter_ref; | |
| 488 my $filter_line = qq{##FILTER=<ID=$filter_name,Description="$filter_description">\n}; | |
| 489 push @$parsed_header, $filter_line; | |
| 490 } | |
| 491 push @$parsed_header, $column_header; | |
| 492 } | |
| 493 | |
| 494 sub add_process_log_to_header { | |
| 495 my ($parsed_header, $input, @params) = @_; | |
| 496 my $column_header = pop @$parsed_header; | |
| 497 my $param_string = join(" ", @params); | |
| 498 push @$parsed_header, qq{##vcfProcessLog=<InputVCF=<$input>, InputVCFSource=<fpfilter>, InputVCFVer=<6.0>, InputVCFParam=<"$param_string"> InputVCFgeneAnno=<.>>\n}, $column_header; | |
| 499 } | |
| 500 | |
| 501 sub filter_sites_in_hash { | |
| 502 my ($hash, $bam_readcount_path, $bam_file, $ref_fasta, $working_dir, $optional_param) = @_; | |
| 503 #done parsing vcf | |
| 504 $optional_param ||= ''; | |
| 505 my $list_name = File::Spec->catfile($working_dir, "regions.txt"); | |
| 506 my $list_fh = IO::File->new($list_name,"w") or die "Unable to open file for coordinates\n"; | |
| 507 generate_region_list($hash, $list_fh); | |
| 508 $list_fh->close(); | |
| 509 | |
| 510 ## run bam-readcount | |
| 511 my $bam_readcount_cmd = "$bam_readcount_path -f $ref_fasta -l $list_name -w 0 -b 20 $optional_param $bam_file|"; | |
| 512 my $rc_results = IO::File->new($bam_readcount_cmd) or die "Unable to open pipe to bam-readcount cmd: $bam_readcount_cmd\n"; | |
| 513 while(my $rc_line = $rc_results->getline) { | |
| 514 chomp $rc_line; | |
| 515 my ($chrom, $position) = split(/\t/, $rc_line); | |
| 516 if($hash->{$chrom}{$position}) { | |
| 517 for my $ref (keys %{$hash->{$chrom}{$position}}) { | |
| 518 for my $var (keys %{$hash->{$chrom}{$position}{$ref}}) { | |
| 519 my $ref_result = read_counts_by_allele($rc_line, $ref); | |
| 520 my $var_result = read_counts_by_allele($rc_line, $var); | |
| 521 my @filters = filter_site($ref_result, $var_result); | |
| 522 | |
| 523 my $vcf_line_ref = $hash->{$chrom}{$position}{$ref}{$var}; | |
| 524 my @fields = split "\t", $$vcf_line_ref; | |
| 525 if($fields[6] eq '.' || $fields[6] eq 'PASS') { | |
| 526 $fields[6] = join(";", @filters); | |
| 527 } | |
| 528 else { | |
| 529 $fields[6] = join(";", $fields[6], @filters) if($filters[0] ne 'PASS'); | |
| 530 } | |
| 531 $$vcf_line_ref = join("\t", @fields); | |
| 532 } | |
| 533 } | |
| 534 } | |
| 535 else { | |
| 536 die "Unknown site for rc\n"; | |
| 537 } | |
| 538 } | |
| 539 unless($rc_results->close) { | |
| 540 die "Error running bam-readcount\n"; | |
| 541 } | |
| 542 } | |
| 543 | |
| 544 sub setup_workdir { | |
| 545 my ($reference, $bam_file, $bam_index) = @_; | |
| 546 $reference = abs_path($reference); | |
| 547 $bam_file = abs_path($bam_file); | |
| 548 $bam_index = abs_path($bam_index) if $bam_index; | |
| 549 | |
| 550 my $dir = File::Temp->newdir('fpfilterXXXXX', TMPDIR => 1, CLEANUP => 1, DIR => './') or | |
| 551 die "Unable to create working directory\n"; | |
| 552 | |
| 553 #symlink in necessary files to run | |
| 554 my $working_reference = File::Spec->catfile($dir, "reference.fa"); | |
| 555 symlink $reference, $working_reference; | |
| 556 | |
| 557 my $fa_index = $reference . ".fai"; | |
| 558 unless(-e $fa_index) { | |
| 559 index_fasta($working_reference); | |
| 560 } | |
| 561 else { | |
| 562 symlink $fa_index, File::Spec->catfile($dir, "reference.fa.fai"); | |
| 563 } | |
| 564 | |
| 565 my $working_bam = File::Spec->catfile($dir, "tumor.bam"); | |
| 566 my $working_bam_index = File::Spec->catfile($dir, "tumor.bam.bai"); | |
| 567 symlink $bam_file, $working_bam; | |
| 568 if($bam_index) { | |
| 569 symlink $bam_index, $working_bam_index; | |
| 570 } | |
| 571 elsif(-e $bam_file . ".bai") { | |
| 572 symlink $bam_file . ".bai", $working_bam_index; | |
| 573 } | |
| 574 else { | |
| 575 index_bam($working_bam); | |
| 576 } | |
| 577 return ($dir, $working_reference, $working_bam); | |
| 578 } | |
| 579 | |
| 580 sub index_fasta { | |
| 581 my ($fasta) = @_; | |
| 582 | |
| 583 print STDERR "Indexing fasta...\n"; | |
| 584 my @args = ($samtools_path, "faidx", $fasta); | |
| 585 system(@args) == 0 | |
| 586 or die "Unable to index $fasta: $?\n"; | |
| 587 } | |
| 588 | |
| 589 sub index_bam { | |
| 590 my ($bam) = @_; | |
| 591 | |
| 592 print STDERR "Indexing BAM...\n"; | |
| 593 my @args = ($samtools_path, "index", $bam); | |
| 594 system(@args) == 0 | |
| 595 or die "Unable to index $bam: $?\n"; | |
| 596 } |
