Mercurial > repos > iuc > snpfreqplot
comparison snpEffExtract.R @ 0:469cc3aadaa7 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/snpfreqplot/ commit 1f35303af979c16d9a3126dbc882a59f686ace5d"
| author | iuc |
|---|---|
| date | Wed, 02 Dec 2020 21:22:37 +0000 |
| parents | |
| children | 59a7bb93b5fe |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:469cc3aadaa7 |
|---|---|
| 1 #!/usr/bin/env R | |
| 2 | |
| 3 suppressPackageStartupMessages(library(VariantAnnotation)) | |
| 4 suppressPackageStartupMessages(library(tidyverse)) | |
| 5 | |
| 6 tsv_eff_from_vcf <- function(input_vcf, output_tab) { | |
| 7 read_vcf <- readVcf(input_vcf) # nolint | |
| 8 chrom_pos <- data.frame(read_vcf@rowRanges)[, c("seqnames", "start")] | |
| 9 ref_alt_filter <- read_vcf@fixed[, c("REF", "ALT", "FILTER")] | |
| 10 dp_af <- read_vcf@info[c("DP", "AF")] | |
| 11 | |
| 12 ## Unwrap the DNAStringList | |
| 13 # nolint start | |
| 14 ref_alt_filter <- data.frame( | |
| 15 REF = as.character(ref_alt_filter$REF), | |
| 16 ALT = sapply(seq_len(nrow(ref_alt_filter)), function(i) { | |
| 17 as.character(ref_alt_filter$ALT[[i]]) | |
| 18 }), | |
| 19 FILTER = as.character(ref_alt_filter$FILTER)) | |
| 20 # nolint end | |
| 21 ## | |
| 22 ## Don't unwrap EFF yet, we need to preserve rows | |
| 23 eff <- read_vcf@info["EFF"] | |
| 24 | |
| 25 stopifnot(nrow(chrom_pos) == nrow(ref_alt_filter)) | |
| 26 stopifnot(nrow(ref_alt_filter) == nrow(dp_af)) | |
| 27 stopifnot(nrow(dp_af) == nrow(eff)) | |
| 28 | |
| 29 ## EFF data contains nested constructs we need to unify all | |
| 30 ## data sources first, and then explode the EFF column. | |
| 31 united <- as_tibble(cbind(chrom_pos, ref_alt_filter, dp_af, eff)) # nolint | |
| 32 united_exploderows <- unnest(united, cols = c(EFF)) # nolint | |
| 33 | |
| 34 united_exploderows <- united_exploderows %>% | |
| 35 dplyr::mutate(CHROM = seqnames, POS = start) %>% | |
| 36 dplyr::select(CHROM, POS, REF, ALT, FILTER, DP, AF, EFF) | |
| 37 | |
| 38 ## EFF columns are defined here: | |
| 39 ## https://pcingola.github.io/SnpEff/se_inputoutput/ | |
| 40 options(warn = -1) ## suppress warnings | |
| 41 seperated_info <- united_exploderows %>% | |
| 42 separate(EFF, sep = "[(|)]", | |
| 43 extra = "merge", ## extra values merged into "extra" column | |
| 44 into = c("EFF[*].EFFECT", "EFF[*].IMPACT", "EFF[*].FUNCLASS", | |
| 45 "codon.change", "EFF[*].AA", "AA.length", | |
| 46 "EFF[*].GENE", "trans.biotype", "gene.coding", | |
| 47 "trans.id", "exon.rank", "gt.num", "warnings", | |
| 48 "extra")) | |
| 49 options(warn = 0) | |
| 50 ## If there is data that has been dropped or filled-in, we will see it in | |
| 51 ## the "extra" column if it isn't NA or an empty quote. | |
| 52 test_missing <- seperated_info %>% | |
| 53 dplyr::select("CHROM", "POS", "extra") %>% | |
| 54 replace_na(list(extra = "")) %>% | |
| 55 filter(extra != "") | |
| 56 | |
| 57 if (nrow(test_missing) > 0) { | |
| 58 print(test_missing) | |
| 59 stop("Extra values were not parsed") | |
| 60 } | |
| 61 | |
| 62 vcf_info <- seperated_info %>% | |
| 63 dplyr::select("CHROM", "POS", "REF", "ALT", "FILTER", "DP", "AF", | |
| 64 "EFF[*].EFFECT", "EFF[*].IMPACT", "EFF[*].FUNCLASS", | |
| 65 "EFF[*].AA", "EFF[*].GENE") %>% | |
| 66 ## now we de-duplicate any rows that arise from subselecting columns | |
| 67 dplyr::distinct() | |
| 68 | |
| 69 ## At this point, we would still have rows which share a POS and ALT pair | |
| 70 ## which could be problematic for the heatmap plot later. | |
| 71 ## | |
| 72 ## This is not something to worry about here, and is resolved in the heatmap | |
| 73 ## script later. | |
| 74 write.table(vcf_info, file = output_tab, | |
| 75 quote = F, sep = "\t", row.names = F) | |
| 76 } | |
| 77 | |
| 78 | |
| 79 # M A I N | |
| 80 stopifnot(exists("samples")) | |
| 81 | |
| 82 for (i in seq_len(nrow(samples))) { | |
| 83 entry <- samples[i, ]; | |
| 84 if (entry$exts %in% c("vcf", "vcf.gz")) { | |
| 85 in_vcf <- entry$files | |
| 86 out_tsv <- paste0(entry$ids, ".tsv") ## use local dir | |
| 87 tsv_eff_from_vcf(in_vcf, out_tsv) | |
| 88 ## point to the new file | |
| 89 samples[i, ]$files <- out_tsv | |
| 90 message(paste(entry$ids, ": converted from VCF to tabular")) | |
| 91 } else { | |
| 92 message(paste(entry$ids, ": already tabular")) | |
| 93 } | |
| 94 } |
