view test-data/decontam_Rscript.Rmd @ 3:871214ac722e draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontam commit 8eca6c026e7bd7ab2e471485cb93b8dee1cd2c07
author iuc
date Sun, 09 Mar 2025 19:27:13 +0000
parents 86da5c894956
children
line wrap: on
line source

---
title: "decontam_docs"
output: html_document
date: "2024-09-10"
---

# This R markdown generates the test data for the wrapper and can be used to test the functions used in the configfile

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## Install test env and run studio in this env

```{bash install}
mamba create --name decontam bioconductor-decontam bioconductor-phyloseq r-tidyverse
mamba activate decontam
rstudio
```

### Check correct R home

```{r home}
R.home()
```

## Get test data

Create test data for wrapper, it should be able to use a matrix and vector as well as phyloseq object as input.

```{r store test data}
R.home()
library(phyloseq)
packageVersion("phyloseq")
library(ggplot2)
packageVersion("ggplot2")
library(decontam)
packageVersion("decontam")

ps <- readRDS(system.file("extdata", "MUClite.rds", package = "decontam"))

# Sample from a physeq object with a sampling function.
#   ps: physeq object to be sampled
#   fun: function to use for sampling (default `sample`)
#   ...: parameters to be passed to fun,
# see `help(sample)` for default parameters
sample_ps <- function(ps, fun = sample, ...) {
    ids <- sample_names(ps)
    sampled_ids <- fun(ids, ...)
    ps <- prune_samples(sampled_ids, ps)
    return(ps)
}

# the initial object is to big for the test case so we subsample
ps <- sample_ps(ps, size = 200)

## ps
# get otu table
otu <- as(otu_table(ps), "matrix")
head(otu[, 1:10], 10)

# add control column to sample data
sample_data(ps)$control <- sample_data(ps)$Sample_or_Control == "Control Sample"
# store as 0 and 1
sample_data(ps)$control <- as.integer(sample_data(ps)$control)
head(sample_data(ps), 1000)

metadata <- as(sample_data(ps), "matrix")
head(metadata, 1000)

# store test data
# stores the row names as column,
# see https://stackoverflow.com/questions/2478352
# /write-table-writes-unwanted-leading-empty-column-to-header-when-has-rownames
write.table(data.frame("SampleID" = rownames(otu), otu),
    file = file.path(getwd(), "otu_input.tsv"),
    sep = "\t",
    row.names = FALSE,
    quote = FALSE
)
write.table(data.frame("SampleID" = rownames(metadata), metadata),
    file = file.path(getwd(), "metadata_input.tsv"),
    sep = "\t",
    row.names = FALSE,
    quote = FALSE
)

saveRDS(ps, file.path(getwd(), "phyloseq_input.rds"))
```

## Load test data

```{r load test data}
library(tidyverse)

# get OTU table (first column is the OTU/ASV ID)
otu <- read_tsv(file.path(getwd(), "otu_input.tsv"))
# use first column as colname
otu2 <- otu %>% tibble::column_to_rownames(colnames(otu)[1])
otu <- otu_table(otu2, taxa_are_rows = FALSE)

# get metadata table must have matching OTU/ASV ID in first column
meta <- read_tsv(file.path(getwd(), "metadata_input.tsv"))
# use first column as colname

meta2 <- meta %>% tibble::column_to_rownames(colnames(meta)[1])
control_column <- "control"

# convert 0/1 to bool for the control column and store in control column
meta2$control <- as.logical(meta2[[control_column]])
sampledata <- sample_data(meta2)

# create dummy tax table (actually not needed,
# but nice to learn how to load phyloseq objects)
taxmat <- as.data.frame(matrix(sample(letters, 10, replace = TRUE),
    nrow = ncol(otu2), ncol = 7
))
rownames(taxmat) <- colnames(otu2)
tax <- tax_table(as.matrix(taxmat))

ps <- phyloseq(otu, tax, sampledata)
```

# plot 1

```{r plot library size vs control}
# Put sample_data into a ggplot-friendly data.frame
df <- as.data.frame(sample_data(ps))
df$LibrarySize <- sample_sums(ps)
df <- df[order(df$LibrarySize), ]
df$Index <- seq_len(nrow(df))
ggplot(data = df, aes(x = Index, y = LibrarySize, color = control)) +
    geom_point()
```

# plot 2

```{r plot prevalence}
contamdf_prev <- isContaminant(ps,
    method = "prevalence",
    neg = "control",
    threshold = 0.5
)
table(contamdf_prev$contaminant)

ps_pa <- transform_sample_counts(ps, function(abund) 1 * (abund > 0))
ps_pa_neg <- prune_samples(sample_data(ps_pa)$control == TRUE, ps_pa)
ps_pa_pos <- prune_samples(sample_data(ps_pa)$control == FALSE, ps_pa)
# Make data_frame of prevalence in positive and negative samples
df_pa <- data.frame(
    pa_pos = taxa_sums(ps_pa_pos), pa_neg = taxa_sums(ps_pa_neg),
    contaminant = contamdf_prev$contaminant
)
ggplot(data = df_pa, aes(x = pa_neg, y = pa_pos, color = contaminant)) +
    geom_point() +
    xlab("Prevalence (Negative Controls)") +
    ylab("Prevalence (True Samples)")
```

# generate output

```{r remove contams}
id_name <- colnames(otu)[1]

ps_noncontam <- prune_taxa(!contamdf_prev$contaminant, ps)

otu_table(ps_noncontam) %>%
    as.data.frame() %>%
    rownames_to_column(id_name) <- otu

write.table(otu,
    file = file.path(getwd(), "otu_output.tsv"),
    sep = "\t",
    row.names = FALSE,
)
```