Mercurial > repos > iuc > dada2_primercheck
comparison test-data/gentest.R @ 0:40cd037434d9 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/dada2 commit 3dd3145db6ed58efc3bf5f71e96515173967fc72
| author | iuc |
|---|---|
| date | Sat, 07 Dec 2024 08:41:16 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:40cd037434d9 |
|---|---|
| 1 library(dada2, quietly = TRUE) | |
| 2 library(ggplot2, quietly = TRUE) | |
| 3 | |
| 4 sample_names <- c("F3D0_S188_L001", "F3D141_S207_L001") | |
| 5 fwd <- c("F3D0_S188_L001_R1_001.fastq.gz", "F3D141_S207_L001_R1_001.fastq.gz") | |
| 6 rev <- c("F3D0_S188_L001_R2_001.fastq.gz", "F3D141_S207_L001_R2_001.fastq.gz") | |
| 7 | |
| 8 filt_fwd <- c("filterAndTrim_F3D0_R1.fq.gz", "filterAndTrim_F3D141_R1.fq.gz") | |
| 9 filt_rev <- c("filterAndTrim_F3D0_R2.fq.gz", "filterAndTrim_F3D141_R2.fq.gz") | |
| 10 | |
| 11 print("filterAndTrim") | |
| 12 | |
| 13 for (i in seq_along(fwd)) { # seq_along: fwd is a character vector of file names, so seq_len(fwd) cannot yield its indices | |
| 14 ftout <- dada2::filterAndTrim(fwd[i], filt_fwd[i], rev[i], filt_rev[i]) # filter one forward/reverse pair per iteration | |
| 15 b <- paste(strsplit(fwd[i], ".", fixed = TRUE)[[1]][1], "tab", sep = ".") # input name up to its first ".", plus ".tab" | |
| 16 write.table(ftout, b, quote = FALSE, sep = "\t", col.names = NA) # per-sample tab-separated table | |
| 17 } | |
| 18 | |
| 19 # In the test only the 1st data set is used. NOTE(review): ftout is reassigned on every iteration of the loop above, so here it holds the result of the last filterAndTrim call; confirm that its row 1 is the intended "1st data set". The variable t shadows base::t() by name; later t(...) calls still work because R skips non-function objects when resolving a call. | |
| 20 t <- data.frame() | |
| 21 t <- rbind(t, ftout[1, ]) | |
| 22 colnames(t) <- colnames(ftout) | |
| 23 rownames(t) <- rownames(ftout)[1] | |
| 24 write.table(t, "filterAndTrim.tab", quote = FALSE, sep = "\t", col.names = NA) | |
| 25 | |
| 26 names(fwd) <- sample_names | |
| 27 names(rev) <- sample_names | |
| 28 names(filt_fwd) <- sample_names | |
| 29 names(filt_rev) <- sample_names | |
| 30 | |
| 31 # Plot quality profiles: one PDF for all forward files, one for all reverse files, and one for the first forward file only (Galaxy compares with sim_size) | |
| 32 print("plots") | |
| 33 qp <- dada2::plotQualityProfile(fwd) | |
| 34 ggsave("qualityProfile_fwd.pdf", qp, width = 20, height = 15, units = c("cm")) | |
| 35 qp <- dada2::plotQualityProfile(rev) | |
| 36 ggsave("qualityProfile_rev.pdf", qp, width = 20, height = 15, units = c("cm")) | |
| 37 qp <- dada2::plotQualityProfile(fwd[1]) | |
| 38 ggsave("qualityProfile.pdf", qp, width = 20, height = 15, units = c("cm")) | |
| 39 | |
| 40 # Plot complexity: one PDF for all forward files, one for all reverse files, and one for the first forward file only (Galaxy compares with sim_size) | |
| 41 | |
| 42 cp <- dada2::plotComplexity(fwd) | |
| 43 ggsave("complexity_fwd.pdf", cp, width = 20, height = 15, units = c("cm")) | |
| 44 cp <- dada2::plotComplexity(rev) | |
| 45 ggsave("complexity_rev.pdf", cp, width = 20, height = 15, units = c("cm")) | |
| 46 cp <- dada2::plotComplexity(fwd[1]) | |
| 47 ggsave("complexity.pdf", cp, width = 20, height = 15, units = c("cm")) | |
| 48 | |
| 49 | |
| 50 # learn Errors: run dada2::learnErrors on the filtered forward and reverse reads, store each result with saveRDS (the files are RDS despite the .Rdata extension) and save the plotErrors plot of each as PDF. NOTE(review): the forward plot goes to learnErrors_R1.pdf but the reverse plot goes to learnErrors.pdf (no _R2 suffix); confirm this is the name the tests expect. | |
| 51 print("learnErrors") | |
| 52 err_fwd <- dada2::learnErrors(filt_fwd) | |
| 53 saveRDS(err_fwd, file = "learnErrors_R1.Rdata") | |
| 54 plot <- dada2::plotErrors(err_fwd) | |
| 55 ggsave("learnErrors_R1.pdf", plot, width = 20, height = 15, units = c("cm")) | |
| 56 | |
| 57 err_rev <- dada2::learnErrors(filt_rev) | |
| 58 saveRDS(err_rev, file = "learnErrors_R2.Rdata") | |
| 59 plot <- dada2::plotErrors(err_rev) | |
| 60 ggsave("learnErrors.pdf", plot, width = 20, height = 15, units = c("cm")) | |
| 61 | |
| 62 # dada | |
| 63 print("dada") | |
| 64 dada_fwd <- dada2::dada(filt_fwd, err_fwd) | |
| 65 dada_rev <- dada2::dada(filt_rev, err_rev) | |
| 66 for (id in sample_names) { | |
| 67 saveRDS(dada_fwd[[id]], file = paste("dada_", id, "_R1.Rdata", sep = "")) | |
| 68 saveRDS(dada_rev[[id]], file = paste("dada_", id, "_R2.Rdata", sep = "")) | |
| 69 } | |
| 70 | |
| 71 # merge pairs | |
| 72 print("mergePairs") | |
| 73 merged <- dada2::mergePairs(dada_fwd, filt_fwd, dada_rev, filt_rev) | |
| 74 for (id in sample_names) { | |
| 75 saveRDS(merged[[id]], file = paste("mergePairs_", id, ".Rdata", sep = "")) | |
| 76 } | |
| 77 | |
| 78 | |
| 79 # make sequence table | |
| 80 print("makeSequenceTable") | |
| 81 seqtab <- makeSequenceTable(merged) | |
| 82 write.table(t(seqtab), file = "makeSequenceTable.tab", quote = FALSE, sep = "\t", row.names = TRUE, col.names = NA) | |
| 83 | |
| 84 reads_per_seqlen <- tapply(colSums(seqtab), factor(nchar(getSequences(seqtab))), sum) | |
| 85 df <- data.frame(length = as.numeric(names(reads_per_seqlen)), count = reads_per_seqlen) | |
| 86 pdf("makeSequenceTable.pdf") # open the PDF device for the reads-per-sequence-length bar chart | |
| 87 print(ggplot(data = df, aes(x = length, y = count)) + # explicit print(): a ggplot object only draws when printed, and auto-printing does not happen under source() or inside a function | |
| 88 geom_col() + | |
| 89 theme_bw()) | |
| 90 bequiet <- dev.off() # close the device; assigning the return value keeps it from being auto-printed | |
| 91 | |
| 92 # remove bimera: write the chimera-filtered table (seqtab_nochim), not the unfiltered seqtab, to removeBimeraDenovo.tab | |
| 93 print("removeBimera") | |
| 94 seqtab_nochim <- dada2::removeBimeraDenovo(seqtab) | |
| 95 write.table(t(seqtab_nochim), file = "removeBimeraDenovo.tab", quote = FALSE, sep = "\t", row.names = TRUE, col.names = NA) | |
| 96 | |
| 97 # assign taxonomy/species | |
| 98 tl <- "Level1,Level2,Level3,Level4,Level5" | |
| 99 tl <- strsplit(tl, ",")[[1]] | |
| 100 | |
| 101 set.seed(42) | |
| 102 print("assignTaxonomyAndSpecies") | |
| 103 taxa <- dada2::assignTaxonomy(seqtab_nochim, "reference.fa.gz", outputBootstraps = TRUE, taxLevels = tl, multithread = 1) | |
| 104 | |
| 105 taxa$tax <- dada2::addSpecies(taxa$tax, "reference_species.fa.gz") | |
| 106 write.table(taxa$tax, file = "assignTaxonomyAddspecies.tab", quote = FALSE, sep = "\t", row.names = TRUE, col.names = NA) | |
| 107 | |
| 108 write.table(taxa$boot, file = "assignTaxonomyAddspecies_boot.tab", quote = FALSE, sep = "\t", row.names = TRUE, col.names = NA) | |
| 109 | |
| 110 | |
| 111 ## Generate extra test data for parameter testing | |
| 112 print("alternatives") | |
| 113 dada2::filterAndTrim(fwd, c("filterAndTrim_single_F3D0_R1.fq.gz", "filterAndTrim_single_F3D141_R1.fq.gz"), rm.phix = TRUE, orient.fwd = "TACGG") | |
| 114 | |
| 115 dada2::filterAndTrim(fwd, c("filterAndTrim_single_trimmers_F3D0_R1.fq.gz", "filterAndTrim_single_trimmers_F3D141_R1.fq.gz"), truncQ = 30, truncLen = 2, trimLeft = 150, trimRight = 2) | |
| 116 | |
| 117 dada2::filterAndTrim(fwd, c("filterAndTrim_single_filters_F3D0_R1.fq.gz", "filterAndTrim_single_filters_F3D141_R1.fq.gz"), maxLen = 255, minLen = 60, maxN = 100, minQ = 13, maxEE = 1) | |
| 118 | |
| 119 | |
| 120 merged_nondef <- dada2::mergePairs(dada_fwd, filt_fwd, dada_rev, filt_rev, minOverlap = 8, maxMismatch = 1, justConcatenate = TRUE, trimOverhang = TRUE) | |
| 121 for (id in sample_names) { | |
| 122 saveRDS(merged_nondef[[id]], file = paste("mergePairs_", id, "_nondefault.Rdata", sep = "")) | |
| 123 } | |
| 124 rb_dada_fwd <- dada2::removeBimeraDenovo(dada_fwd[["F3D0_S188_L001"]]) | |
| 125 write.table(rb_dada_fwd, file = "removeBimeraDenovo_F3D0_dada_uniques.tab", quote = FALSE, sep = "\t", row.names = TRUE, col.names = FALSE) | |
| 126 | |
| 127 rb_merged <- dada2::removeBimeraDenovo(merged, method = "pooled") | |
| 128 saveRDS(rb_merged, file = "removeBimeraDenovo_F3D0_mergepairs.Rdata") | |
| 129 | |
| 130 # SeqCounts: get_n(x) returns the sum of the values that dada2::getUniques(x) yields for x; it is applied below (via sapply) to the objects read back with readRDS | |
| 131 get_n <- function(x) { | |
| 132 sum(dada2::getUniques(x)) | |
| 133 } | |
| 134 | |
| 135 print("seqCounts ft") | |
| 136 samples <- list() | |
| 137 samples[["F3D0_S188_L001_R1_001.tab"]] <- read.table("F3D0_S188_L001_R1_001.tab", header = TRUE, sep = "\t", row.names = 1) | |
| 138 dname <- "filter" | |
| 139 tdf <- samples[["F3D0_S188_L001_R1_001.tab"]] | |
| 140 names(tdf) <- paste(dname, names(tdf)) | |
| 141 tdf <- cbind(data.frame(samples = names(samples)), tdf) | |
| 142 write.table(tdf, "seqCounts_filter.tab", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE) | |
| 143 | |
| 144 samples <- list() | |
| 145 samples[["F3D0_S188_L001_R1_001.tab"]] <- read.table("F3D0_S188_L001_R1_001.tab", header = TRUE, sep = "\t", row.names = 1) | |
| 146 samples[["F3D141_S207_L001_R1_001.tab"]] <- read.table("F3D141_S207_L001_R1_001.tab", header = TRUE, sep = "\t", row.names = 1) | |
| 147 dname <- "filter" | |
| 148 tdf <- samples[["F3D0_S188_L001_R1_001.tab"]] | |
| 149 tdf <- rbind(tdf, samples[["F3D141_S207_L001_R1_001.tab"]]) | |
| 150 names(tdf) <- paste(dname, names(tdf)) | |
| 151 tdf <- cbind(data.frame(samples = names(samples)), tdf) | |
| 152 write.table(tdf, "seqCounts_filter_both.tab", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE) | |
| 153 | |
| 154 print("seqCounts dada") | |
| 155 samples <- list() | |
| 156 samples[["dada_F3D0_S188_L001_R1.Rdata"]] <- readRDS("dada_F3D0_S188_L001_R1.Rdata") | |
| 157 samples[["dada_F3D141_S207_L001_R1.Rdata"]] <- readRDS("dada_F3D141_S207_L001_R1.Rdata") | |
| 158 dname <- "dadaF" | |
| 159 tdf <- data.frame(samples = names(samples)) | |
| 160 tdf[[dname]] <- sapply(samples, get_n) | |
| 161 write.table(tdf, "seqCounts_dadaF.tab", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE) | |
| 162 | |
| 163 print("seqCounts mp") | |
| 164 samples <- list() | |
| 165 samples[["mergePairs_F3D0_S188_L001.Rdata"]] <- readRDS("mergePairs_F3D0_S188_L001.Rdata") | |
| 166 samples[["mergePairs_F3D141_S207_L001.Rdata"]] <- readRDS("mergePairs_F3D141_S207_L001.Rdata") | |
| 167 dname <- "merge" | |
| 168 tdf <- data.frame(samples = names(samples)) | |
| 169 tdf[[dname]] <- sapply(samples, get_n) | |
| 170 write.table(tdf, "seqCounts_merge.tab", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE) | |
| 171 | |
| 172 print("seqCounts st") | |
| 173 samples <- list() | |
| 174 samples <- t(as.matrix(read.table("makeSequenceTable.tab", header = TRUE, sep = "\t", row.names = 1))) | |
| 175 dname <- "seqtab" | |
| 176 tdf <- data.frame(samples = row.names(samples)) | |
| 177 tdf[[dname]] <- rowSums(samples) | |
| 178 write.table(tdf, "seqCounts_seqtab.tab", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE) | |
| 179 | |
| 180 print("seqCounts rb") | |
| 181 samples <- list() | |
| 182 samples <- t(as.matrix(read.table("removeBimeraDenovo.tab", header = TRUE, sep = "\t", row.names = 1))) | |
| 183 dname <- "nochim" | |
| 184 tdf <- data.frame(samples = row.names(samples)) | |
| 185 tdf[[dname]] <- rowSums(samples) | |
| 186 write.table(tdf, "seqCounts_nochim.tab", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE) |
