Mercurial > repos > iuc > decontam
comparison test-data/decontam_Rscript.Rmd @ 0:86da5c894956 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontam commit 9e0ce6d02ee71d5b974ef615b1c5286bc45d8e6b
| author | iuc |
|---|---|
| date | Tue, 15 Oct 2024 13:36:46 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:86da5c894956 |
|---|---|
| 1 --- | |
| 2 title: "decontam_docs" | |
| 3 output: html_document | |
| 4 date: "2024-09-10" | |
| 5 --- | |
| 6 | |
| 7 # This R markdown generates the test data for the wrapper and can be used to test the functions used in the configfile | |
| 8 | |
| 9 ```{r setup, include=FALSE} | |
| 10 knitr::opts_chunk$set(echo = TRUE) | |
| 11 ``` | |
| 12 | |
| 13 ## Install test env and run studio in this env | |
| 14 | |
| 15 ```{bash install} | |
| 16 mamba create --name decontam bioconductor-decontam bioconductor-phyloseq r-tidyverse | |
| 17 mamba activate decontam | |
| 18 rstudio | |
| 19 ``` | |
| 20 | |
| 21 ### Check correct R home | |
| 22 | |
| 23 ```{r home} | |
| 24 R.home() | |
| 25 ``` | |
| 26 | |
| 27 ## Get test data | |
| 28 | |
| 29 Create test data for wrapper, it should be able to use a matrix and vector as well as phyloseq object as input. | |
| 30 | |
| 31 ```{r store test data} | |
| 32 R.home() | |
| 33 library(phyloseq) | |
| 34 packageVersion("phyloseq") | |
| 35 library(ggplot2) | |
| 36 packageVersion("ggplot2") | |
| 37 library(decontam) | |
| 38 packageVersion("decontam") | |
| 39 | |
| 40 ps <- readRDS(system.file("extdata", "MUClite.rds", package = "decontam")) | |
| 41 | |
| 42 # Sample from a physeq object with a sampling function. | |
| 43 # ps: physeq object to be sampled | |
| 44 # fun: function to use for sampling (default `sample`) | |
| 45 # ...: parameters to be passed to fun, | |
| 46 # see `help(sample)` for default parameters | |
| 47 sample_ps <- function(ps, fun = sample, ...) { | |
| 48 ids <- sample_names(ps) | |
| 49 sampled_ids <- fun(ids, ...) | |
| 50 ps <- prune_samples(sampled_ids, ps) | |
| 51 return(ps) | |
| 52 } | |
| 53 | |
| 54 # the initial object is to big for the test case so we subsample | |
| 55 ps <- sample_ps(ps, size = 200) | |
| 56 | |
| 57 ## ps | |
| 58 # get otu table | |
| 59 otu <- as(otu_table(ps), "matrix") | |
| 60 head(otu[, 1:10], 10) | |
| 61 | |
| 62 # add control column to sample data | |
| 63 sample_data(ps)$control <- sample_data(ps)$Sample_or_Control == "Control Sample" | |
| 64 # store as 0 and 1 | |
| 65 sample_data(ps)$control <- as.integer(sample_data(ps)$control) | |
| 66 head(sample_data(ps), 1000) | |
| 67 | |
| 68 metadata <- as(sample_data(ps), "matrix") | |
| 69 head(metadata, 1000) | |
| 70 | |
| 71 # store test data | |
| 72 # stores the row names as column, | |
| 73 # see https://stackoverflow.com/questions/2478352 | |
| 74 # /write-table-writes-unwanted-leading-empty-column-to-header-when-has-rownames | |
| 75 write.table(data.frame("SampleID" = rownames(otu), otu), | |
| 76 file = file.path(getwd(), "otu_input.tsv"), | |
| 77 sep = "\t", | |
| 78 row.names = FALSE, | |
| 79 quote = FALSE | |
| 80 ) | |
| 81 write.table(data.frame("SampleID" = rownames(metadata), metadata), | |
| 82 file = file.path(getwd(), "metadata_input.tsv"), | |
| 83 sep = "\t", | |
| 84 row.names = FALSE, | |
| 85 quote = FALSE | |
| 86 ) | |
| 87 | |
| 88 saveRDS(ps, file.path(getwd(), "phyloseq_input.rds")) | |
| 89 ``` | |
| 90 | |
| 91 ## Load test data | |
| 92 | |
| 93 ```{r load test data} | |
| 94 library(tidyverse) | |
| 95 | |
| 96 # get OTU table (first column is the OTU/ASV ID) | |
| 97 otu <- read_tsv(file.path(getwd(), "otu_input.tsv")) | |
| 98 # use first column as colname | |
| 99 otu2 <- otu %>% tibble::column_to_rownames(colnames(otu)[1]) | |
| 100 otu <- otu_table(otu2, taxa_are_rows = FALSE) | |
| 101 | |
| 102 # get metadata table must have matching OTU/ASV ID in first column | |
| 103 meta <- read_tsv(file.path(getwd(), "metadata_input.tsv")) | |
| 104 # use first column as colname | |
| 105 | |
| 106 meta2 <- meta %>% tibble::column_to_rownames(colnames(meta)[1]) | |
| 107 control_column <- "control" | |
| 108 | |
| 109 # convert 0/1 to bool for the control column and store in control column | |
| 110 meta2$control <- as.logical(meta2[[control_column]]) | |
| 111 sampledata <- sample_data(meta2) | |
| 112 | |
| 113 # create dummy tax table (actually not needed, | |
| 114 # but nice to learn how to load phyloseq objects) | |
| 115 taxmat <- as.data.frame(matrix(sample(letters, 10, replace = TRUE), | |
| 116 nrow = ncol(otu2), ncol = 7 | |
| 117 )) | |
| 118 rownames(taxmat) <- colnames(otu2) | |
| 119 tax <- tax_table(as.matrix(taxmat)) | |
| 120 | |
| 121 ps <- phyloseq(otu, tax, sampledata) | |
| 122 ``` | |
| 123 | |
| 124 # plot 1 | |
| 125 | |
| 126 ```{r plot library size vs control} | |
| 127 # Put sample_data into a ggplot-friendly data.frame | |
| 128 df <- as.data.frame(sample_data(ps)) | |
| 129 df$LibrarySize <- sample_sums(ps) | |
| 130 df <- df[order(df$LibrarySize), ] | |
| 131 df$Index <- seq_len(nrow(df)) | |
| 132 ggplot(data = df, aes(x = Index, y = LibrarySize, color = control)) + | |
| 133 geom_point() | |
| 134 ``` | |
| 135 | |
| 136 # plot 2 | |
| 137 | |
| 138 ```{r plot prevalence} | |
| 139 contamdf_prev <- isContaminant(ps, | |
| 140 method = "prevalence", | |
| 141 neg = "control", | |
| 142 threshold = 0.5 | |
| 143 ) | |
| 144 table(contamdf_prev$contaminant) | |
| 145 | |
| 146 ps_pa <- transform_sample_counts(ps, function(abund) 1 * (abund > 0)) | |
| 147 ps_pa_neg <- prune_samples(sample_data(ps_pa)$control == TRUE, ps_pa) | |
| 148 ps_pa_pos <- prune_samples(sample_data(ps_pa)$control == FALSE, ps_pa) | |
| 149 # Make data_frame of prevalence in positive and negative samples | |
| 150 df_pa <- data.frame( | |
| 151 pa_pos = taxa_sums(ps_pa_pos), pa_neg = taxa_sums(ps_pa_neg), | |
| 152 contaminant = contamdf_prev$contaminant | |
| 153 ) | |
| 154 ggplot(data = df_pa, aes(x = pa_neg, y = pa_pos, color = contaminant)) + | |
| 155 geom_point() + | |
| 156 xlab("Prevalence (Negative Controls)") + | |
| 157 ylab("Prevalence (True Samples)") | |
| 158 ``` | |
| 159 | |
| 160 # generate output | |
| 161 | |
| 162 ```{r remove contams} | |
| 163 id_name <- colnames(otu)[1] | |
| 164 | |
| 165 ps_noncontam <- prune_taxa(!contamdf_prev$contaminant, ps) | |
| 166 | |
| 167 otu_table(ps_noncontam) %>% | |
| 168 as.data.frame() %>% | |
| 169 rownames_to_column(id_name) <- otu | |
| 170 | |
| 171 write.table(otu, | |
| 172 file = file.path(getwd(), "otu_output.tsv"), | |
| 173 sep = "\t", | |
| 174 row.names = FALSE, | |
| 175 ) | |
| 176 ``` |
