comparison test-data/decontam_Rscript.Rmd @ 0:86da5c894956 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontam commit 9e0ce6d02ee71d5b974ef615b1c5286bc45d8e6b
author iuc
date Tue, 15 Oct 2024 13:36:46 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:86da5c894956
1 ---
2 title: "decontam_docs"
3 output: html_document
4 date: "2024-09-10"
5 ---
6
7 # This R markdown generates the test data for the wrapper and can be used to test the functions used in the configfile
8
9 ```{r setup, include=FALSE}
10 knitr::opts_chunk$set(echo = TRUE)
11 ```
12
13 ## Install test env and run studio in this env
14
15 ```{bash install}
16 mamba create --name decontam bioconductor-decontam bioconductor-phyloseq r-tidyverse
17 mamba activate decontam
18 rstudio
19 ```
20
21 ### Check correct R home
22
23 ```{r home}
24 R.home()
25 ```
26
27 ## Get test data
28
29 Create test data for wrapper, it should be able to use a matrix and vector as well as phyloseq object as input.
30
31 ```{r store test data}
32 R.home()
33 library(phyloseq)
34 packageVersion("phyloseq")
35 library(ggplot2)
36 packageVersion("ggplot2")
37 library(decontam)
38 packageVersion("decontam")
39
40 ps <- readRDS(system.file("extdata", "MUClite.rds", package = "decontam"))
41
42 # Sample from a physeq object with a sampling function.
43 # ps: physeq object to be sampled
44 # fun: function to use for sampling (default `sample`)
45 # ...: parameters to be passed to fun,
46 # see `help(sample)` for default parameters
47 sample_ps <- function(ps, fun = sample, ...) {
48 ids <- sample_names(ps)
49 sampled_ids <- fun(ids, ...)
50 ps <- prune_samples(sampled_ids, ps)
51 return(ps)
52 }
53
54 # the initial object is to big for the test case so we subsample
55 ps <- sample_ps(ps, size = 200)
56
57 ## ps
58 # get otu table
59 otu <- as(otu_table(ps), "matrix")
60 head(otu[, 1:10], 10)
61
62 # add control column to sample data
63 sample_data(ps)$control <- sample_data(ps)$Sample_or_Control == "Control Sample"
64 # store as 0 and 1
65 sample_data(ps)$control <- as.integer(sample_data(ps)$control)
66 head(sample_data(ps), 1000)
67
68 metadata <- as(sample_data(ps), "matrix")
69 head(metadata, 1000)
70
71 # store test data
72 # stores the row names as column,
73 # see https://stackoverflow.com/questions/2478352
74 # /write-table-writes-unwanted-leading-empty-column-to-header-when-has-rownames
75 write.table(data.frame("SampleID" = rownames(otu), otu),
76 file = file.path(getwd(), "otu_input.tsv"),
77 sep = "\t",
78 row.names = FALSE,
79 quote = FALSE
80 )
81 write.table(data.frame("SampleID" = rownames(metadata), metadata),
82 file = file.path(getwd(), "metadata_input.tsv"),
83 sep = "\t",
84 row.names = FALSE,
85 quote = FALSE
86 )
87
88 saveRDS(ps, file.path(getwd(), "phyloseq_input.rds"))
89 ```
90
91 ## Load test data
92
93 ```{r load test data}
94 library(tidyverse)
95
96 # get OTU table (first column is the OTU/ASV ID)
97 otu <- read_tsv(file.path(getwd(), "otu_input.tsv"))
98 # use first column as colname
99 otu2 <- otu %>% tibble::column_to_rownames(colnames(otu)[1])
100 otu <- otu_table(otu2, taxa_are_rows = FALSE)
101
102 # get metadata table must have matching OTU/ASV ID in first column
103 meta <- read_tsv(file.path(getwd(), "metadata_input.tsv"))
104 # use first column as colname
105
106 meta2 <- meta %>% tibble::column_to_rownames(colnames(meta)[1])
107 control_column <- "control"
108
109 # convert 0/1 to bool for the control column and store in control column
110 meta2$control <- as.logical(meta2[[control_column]])
111 sampledata <- sample_data(meta2)
112
113 # create dummy tax table (actually not needed,
114 # but nice to learn how to load phyloseq objects)
115 taxmat <- as.data.frame(matrix(sample(letters, 10, replace = TRUE),
116 nrow = ncol(otu2), ncol = 7
117 ))
118 rownames(taxmat) <- colnames(otu2)
119 tax <- tax_table(as.matrix(taxmat))
120
121 ps <- phyloseq(otu, tax, sampledata)
122 ```
123
124 # plot 1
125
126 ```{r plot library size vs control}
127 # Put sample_data into a ggplot-friendly data.frame
128 df <- as.data.frame(sample_data(ps))
129 df$LibrarySize <- sample_sums(ps)
130 df <- df[order(df$LibrarySize), ]
131 df$Index <- seq_len(nrow(df))
132 ggplot(data = df, aes(x = Index, y = LibrarySize, color = control)) +
133 geom_point()
134 ```
135
136 # plot 2
137
138 ```{r plot prevalence}
139 contamdf_prev <- isContaminant(ps,
140 method = "prevalence",
141 neg = "control",
142 threshold = 0.5
143 )
144 table(contamdf_prev$contaminant)
145
146 ps_pa <- transform_sample_counts(ps, function(abund) 1 * (abund > 0))
147 ps_pa_neg <- prune_samples(sample_data(ps_pa)$control == TRUE, ps_pa)
148 ps_pa_pos <- prune_samples(sample_data(ps_pa)$control == FALSE, ps_pa)
149 # Make data_frame of prevalence in positive and negative samples
150 df_pa <- data.frame(
151 pa_pos = taxa_sums(ps_pa_pos), pa_neg = taxa_sums(ps_pa_neg),
152 contaminant = contamdf_prev$contaminant
153 )
154 ggplot(data = df_pa, aes(x = pa_neg, y = pa_pos, color = contaminant)) +
155 geom_point() +
156 xlab("Prevalence (Negative Controls)") +
157 ylab("Prevalence (True Samples)")
158 ```
159
160 # generate output
161
162 ```{r remove contams}
163 id_name <- colnames(otu)[1]
164
165 ps_noncontam <- prune_taxa(!contamdf_prev$contaminant, ps)
166
167 otu_table(ps_noncontam) %>%
168 as.data.frame() %>%
169 rownames_to_column(id_name) <- otu
170
171 write.table(otu,
172 file = file.path(getwd(), "otu_output.tsv"),
173 sep = "\t",
174 row.names = FALSE,
175 )
176 ```