Mercurial > repos > matthias > dada2_learnerrors
diff data2.R @ 0:56d5be6c03b9 draft
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/tree/topic/dada2/tools/dada2 commit d63c84012410608b3b5d23e130f0beff475ce1f8-dirty
| author | matthias | 
|---|---|
| date | Fri, 08 Mar 2019 06:30:11 -0500 | 
| parents | |
| children | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data2.R Fri Mar 08 06:30:11 2019 -0500 @@ -0,0 +1,352 @@ +library(dada2) + +# library(DBI) +# library(ggplot2) +library(optparse) +# library(RSQLite) +# library(stringr) + +## source required R functions +source('user_input_functions.R') + +# print dada2 version +print(paste("dada2 version: ", packageVersion("dada2"))) + +# # R function to create fasta file from dada2 output data +# outdir is directory to output fasta file +# taxa is taxonomy file generated by dada2 +# prefix is string for desired prefix attached to output file names + +dada2fasta <- function(outdir, seqtab.nochim, prefix){ + seq <- colnames(seqtab.nochim) + n <- 0 + ASVs <- c() + for(i in seq){ + n <- n + 1 + ASV <- paste('ASV', as.character(n), sep = '_') + ASVs <- c(ASVs, ASV) + line1 <- paste('>',ASV,sep='') + write(line1, file.path(outdir,sprintf('%s.fasta',prefix)), append=T) + write(i, file.path(outdir,sprintf('%s.fasta',prefix)), append=T) + } + return(ASVs) +} + + +# # R DADA2 workflow +# wd is path to fastq files +# r_path is path to user_input_functions.R +# outdir is path to output directory +# prefix is string for desired prefix attached to output file names + +dada2run <- function(wd, r_path, outdir, prefix){ + + # read-in files------------------------------------------------------- + ## obtain vectors of forward and reverse reads based on 'R1' and 'R2' in file names + ## additionally obtain the coressponding sample names for these files + p1 <- c() + p2 <- c() + sample.names <- c() + for(f in list.files(wd, full.names=T)){ + if(grepl('_1.fq', f)){ + sample <- gsub('^.*[/](.*?)_1\\.fq\\.gz', '\\1', f) + sample.names <- c(sample.names, sample) + p1 <- c(p1, f) + } + if(grepl('_2.fq', f)){ + p2 <- c(p2, f) + } + } + fnFs <- sort(p1) + fnRs <- sort(p2) + sample.names <- sort(sample.names) + + save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_test.Rdata'))) + #load(file = file.path(outdir, paste0(prefix, 'state_test.Rdata'))) + + ## print for review + to_view <- data.frame(sample.names, fnFs, fnRs) + cat("The following fastq files were found:\n") + print(to_view) + + # Perform quality filtering and trimming--------------------------------- + ## assign new names to samples + filtFs <- file.path(outdir, paste0(sample.names, 'F_filt.fastq.gz')) + filtRs <- file.path(outdir, paste0(sample.names, 'R_filt.fastq.gz')) + + ## plot forward and reverse quality so that user can decide on filtering parameters + cat('Plotting quality profile of forward reads...\n') + Fqp1 <- plotQualityProfile(fnFs[1]) + #print(Fqp1) + ggsave(sprintf('%s_forward_1_qualityprofile.pdf',prefix), Fqp1, path = outdir, width = 20,height = 15,units = c("cm")) + #ggsave(sprintf('%s_forward_1_qualityprofile.emf',prefix), Fqp1, path = outdir, width = 20,height = 15,units = c("cm")) + Fqp2 <- plotQualityProfile(fnFs[2]) + #print(Fqp2) + ggsave(sprintf('%s_forward_2_qualityprofile.pdf',prefix),Fqp2, path = outdir, width = 20,height = 15,units = c("cm")) + #ggsave(sprintf('%s_forward_2_qualityprofile.emf',prefix), Fqp2, path = outdir, width = 20,height = 15,units = c("cm")) + #cat('Which position would you like to truncate the forward reads at?\nPlease use the red-dashed lines as a guide, where they stop appearing indicates good quality.\nNOTE: Do NOT over-trim! You still require overlap between your forward and reverse reads in order to merge them later!\n') + len1 <- 240 + cat('Plotting quality profile of reverse reads...\n') + Rqp1 <- plotQualityProfile(fnRs[1]) + #print(Rqp1) + ggsave(sprintf('%s_reverse_1_qualityprofile.pdf',prefix),Rqp1, path = outdir, width = 20,height = 15,units = c("cm")) + #ggsave(sprintf('%s_reverse_1_qualityprofile.emf',prefix), Rqp1, path = outdir, width = 20,height = 15,units = c("cm")) + Rqp2 <- plotQualityProfile(fnRs[2]) + #print(Rqp2) + ggsave(sprintf('%s_reverse_2_qualityprofile.pdf',prefix), Rqp2, path = outdir, width = 20,height = 15,units = c("cm")) + #ggsave(sprintf('%s_reverse_2_qualityprofile.emf',prefix), Rqp2, path = outdir, width = 20,height = 15,units = c("cm")) + #cat('Which position would you like to truncate the forward reads at?\nPlease use the red-dashed lines as a guide, where they stop appearing indicates good quality.\nNOTE: Do NOT over-trim! You still require overlap between your forward and reverse reads in order to merge them later!\n') + len2 <- 240 + + ## filter and trim + ## remaining parameters set to recommended defaults + ## maxN must be set to 0 (DADA2 requries no Ns) + ## The maxEE parameter sets the maximum number of "expected errors" allowed in a read, which is a better filter than simply averaging quality scores. + ## If not using Windows, you may set multithread to TRUE + ## NOTE: Do not use the truncLen parameter for ITS sequencing + ## trimLeft needs to be varied based on the size of your primers (i.e. it is used to trim your primer sequences)!! + cat('Filtering and trimming sequences...\n') + out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(len1,len2), maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=T, compress=T, multithread=threads, trimLeft=15) + + ## have user review read count changes, and relax error rate if too many reads are lost + ## for example, you may especially want to relax the number of expected errors on the reverse reads (i.e. maxEE=c(2,5)), as the reverse is prone to error on the Illumina sequencing platform + print(head(out)) + check2 <- T + while(check2 == F){ + maxF <- numeric_input("What would you like the maximum number of expected errors in the forward reads to be?\nDefault 2:", 2) + maxR <- numeric_input("What would you like the maximum number of expected errors in the reverse reads to be?\nDefault 5:", 5) + out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(len1,len2), maxN=0, maxEE=c(maxF,maxR), truncQ=2, rm.phix=T, compress=T, multithread=threads) + print(head(out)) + check2 <- yn_input('Proceed? If you lost too many reads, you can choose to not proceed and you will have the option to relax the error rate. Default yes:',T) + } + + # Have DADA2 learn the error rates----------------------------------------------- + ## If not using Windows, you may set multithread to TRUE + read.subset <- 1e6 + cat('Learning error rate of forward reads...\n') + errF <- learnErrors(filtFs, nreads=read.subset, multithread=threads) + + ## have user check estimated error rates + ## note the calculations are done with a subset of the total reads, as it is computationally intensive + ## if the fit is poor, the user has the option to try again with an increased subset number of reads + Error_f <- plotErrors(errF, nominalQ = T) + #print(Error_f) + ggsave(sprintf('%s_forward_Error_plot.pdf',prefix), Error_f, path = outdir, width = 20,height = 15,units = c("cm")) + #ggsave(sprintf('%s_forward_Error_plot.emf',prefix), Error_f, path = outdir, width = 20,height = 15,units = c("cm")) + check3a <- T + while(check3a == F){ + read.subset <- numeric_input('Please specify the number of reads you would like dada2 to utilize to calculate the error rate.\nThe default previously used was 1e6.\nThe newly recommended default is 10-fold greater,\n1e7:',1e7) + errF <- learnErrors(filtFs, nreads=read.subset, multithread=threads) + print(Error_f) + ggsave(sprintf('%s_forward_Error_plot.pdf',prefix), path = outdir, width = 20,height = 15,units = c("cm")) + ggsave(sprintf('%s_forward_Error_plot.emf',prefix), path = outdir, width = 20,height = 15,units = c("cm")) + check3a <- yn_input('Proceed?\nThe estimated error rate (black line) should fit to the observed error rates for each consensus quality score (black points).\nAdditionally, the error rates expected under the nominal definition of the Q-value (quality score) should decrease as the quality score increases (or flat-line).\nIf you do not have a good fit, you may want dada2 to re-learn the error rates with a higher number of reads in the utilized subset.\nA subset of reads is always used as the algorithm is computationally intensive.\nDefault yes:',T) + } + + + ## also do for reverseL + cat('Learning error rate of reverse reads...\n') + errR <- learnErrors(filtRs, nreads=read.subset, multithread=threads) + Error_r <- plotErrors(errR, nominalQ=T) + #print(Error_r) + ggsave(sprintf('%s_reverse_Error_plot.pdf',prefix), Error_r, path = outdir, width = 20,height = 15,units = c("cm")) + #ggsave(sprintf('%s_reverse_Error_plot.emf',prefix), Error_r, path = outdir, width = 20,height = 15,units = c("cm")) + check3b <- T + while(check3b == F){ + read.subset <- numeric_input('Please specify the number of reads you would like dada2 to utilize to calculate the error rate.\nThe default previously used was 1e6.\nThe newly recommended default is 10-fold greater,\n1e7:',1e7) + errR <- learnErrors(filtRs, nreads=read.subset, multithread=threads) + print(Error_r) + ggsave(sprintf('%s_reverse_Error_plot.pdf',prefix), path = outdir, width = 20,height = 15,units = c("cm")) + #ggsave(sprintf('%s_reverse_Error_plot.emf',prefix), path = outdir, width = 20,height = 15,units = c("cm")) + check3b <- yn_input('Proceed?\nThe estimated error rate (black line) should fit to the observed error rates for each consensus quality score (black points).\nAdditionally, the error rates expected under the nominal definition of the Q-value (quality score) should decrease as the quality score increases (or flat-line).\nIf you do not have a good fit, you may want dada2 to re-learn the error rates with a higher number of reads in the utilized subset.\nA subset of reads is always used as the algorithm is computationally intensive.\nDefault yes:',T) + } + + save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_learning.Rdata'))) + #load(file = file.path(outdir, paste0(prefix, 'state_post_learning.Rdata'))) + + # Dereplicate sequences to generate unique sequence fastq files with corresponding count tables------------------------- + ## NOTE: if your dataset is huge, you may run out of RAM. Please see https://benjjneb.github.io/dada2/bigdata.html for details. + cat('Dereplicating forward reads...\n') + derepFs <- derepFastq(filtFs, verbose=T) + cat('Dereplicating reverse reads...\n') + derepRs <- derepFastq(filtRs, verbose=T) + + ## name the derep-class objects by sample names + names(derepFs) <- sample.names + names(derepRs) <- sample.names + + save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_derep.Rdata'))) + #load(file = file.path(outdir, paste0(prefix, 'state_post_derep.Rdata'))) + + # Infer sequence variants using learned error rates--------------------------------- + ## If not using Windows, you may set multithread to TRUE + ## NOTE: if your dataset is huge, you may run out of RAM. Please see https://benjjneb.github.io/dada2/bigdata.html for details. + ## NOTE2: you can use DADA2 for 454 or IonTorrent data as well. Please see https://benjjneb.github.io/dada2/tutorial.html. + s.pool = F + cat('Inferring sequence variants of forward reads...\n') + dadaFs <- dada(derepFs, err=errF, pool=s.pool, multithread=threads) + + ## have user inspect detected number of sequence variants, to ensure it is logical based on the biological context of their samples + ## if you have low sampling depths, you may not want to process each sample independently as per default, but set pool=T. It gives better results at increased computation time. The user will have the option to do this if the number of sequence variants doesn't make sense. + print(dadaFs[[1]]) + check4 <- T + if(check4 == F){ + s.pool = T + dadaFs <- dada(derepFs, err=errF, pool=s.pool, multithread=threads) + print(dadaFs[[1]]) + cat('Hopefully, these results make more sense.\nOtherwise, there is not much more you can do except start over!\n') + check <- yn_input('Proceed? Default yes, no to quit:',T) + if(check == F){ + stop() + } + } + + ## also do for reverse, but don't need to re-check as you need to keep the pool consistent between the forward and reverse! + cat('Inferring sequence variants of reversed reads...\n') + dadaRs <- dada(derepRs, err=errR, pool=s.pool, multithread=threads) + + save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_dada.Rdata'))) + #load(file = file.path(outdir, paste0(prefix, 'state_post_dada.Rdata'))) + + + # Merge forward and reverse reads------------------------------------------------- + cat('Merging paired-end reads...\n') + mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=T) + #cat('Most of your reads should have been retained (i.e. were able to merge, see above).\nOtherwise, there is not much more you can do except start over (i.e. did you over-trim your sequences??)!\n') + check <- T + if(check == F){ + stop() + } + + # Construct sequences table------------------------------------------------------- + cat('Constructing sequence table...\n') + seqtab <- makeSequenceTable(mergers) + + save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_merge.Rdata'))) + #load(file = file.path(outdir, paste0(prefix, 'state_post_merge.Rdata'))) + + + ## inspect distribution of sequence lengths + ## give user the option to filter out overly long or short sequneces + cat('Sequence length distribution listed below:\n') + print(table(nchar(getSequences(seqtab)))) + check5 <- T + if(check5 == F){ + min.cutoff <- numeric_input('Please input desired minimum length of sequences:',NULL) + max.cutoff <- numeric_input('Please input desired maximum length of sequences:',NULL) + seqtab <- seqtab[,nchar(colnames(seqtab)) %in% seq(min.cutoff,max.cutoff)] + } + + # Remove chimeras------------------------------------------------------------------ + ## If not using Windows, you may set multithread to TRUE + cat('Removing chimeras...\n') + seqtab.nochim <- removeBimeraDenovo(seqtab, method='consensus', multithread=threads, verbose=T) + + ## display percentage of chimeras removed + ## this number should be small (<5%), otherwise some processing parameters need to be revisited + percent.nochim <- (sum(seqtab.nochim)/sum(seqtab))*100 + percent.nochim <- paste(as.character(percent.nochim),'of reads retained after chimera removal.\n',sep=' ') + cat(percent.nochim) + #cat('Most of your reads should have been retained.\nOtherwise, there is not much more you can do except start over!\n') + check <- T + if(check == F){ + stop() + } + + # Final sanity check-------------------------------------------------------------- + ## track reads removed throughout the pipeline + ## If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs) + getN <- function(x) sum(getUniques(x)) + track <- cbind(out, sapply(dadaFs, getN), sapply(mergers, getN), rowSums(seqtab), rowSums(seqtab.nochim)) + colnames(track) <- c("input", "filtered", "denoised", "merged", "tabled", "nonchim") + rownames(track) <- sample.names + print(head(track)) + #cat('Most of your reads should have been retained.\nOtherwise, there is not much more you can do except start over!\n') + check <- T + if(check == F){ + stop() + } + + write.csv(track,file=file.path(outdir, sprintf('%s_read_count-quality_control.csv',prefix))) + + save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_chimera.Rdata'))) +# load(file = file.path(outdir, paste0(prefix, 'state_post_chimera.Rdata'))) + + # Assign taxonomy----------------------------------------------------------------- + ## require silva database files + ## If not using Windows, you may set multithread to TRUE + ## Minimum boot strap should be 80, but for sequnce length =< 250 Minimum bootstrap set to 50 (which is also the default) + + ## SILVA + cat('Assigning taxonomy to genus level using SILVA...\n') + taxa_silva <- assignTaxonomy(seqtab.nochim, file.path(wd,"silva_nr_v132_train_set.fa.gz"), multithread=threads, minBoot=80, tryRC=T) + cat('Assigning taxonomy at species level using SILVA...\n') + taxa_silva <- addSpecies(taxa_silva, file.path(wd,"silva_species_assignment_v132.fa.gz"), allowMultiple=T, tryRC=T) + write.csv(taxa_silva,file=file.path(outdir, sprintf('%s_taxonomy_silva.csv',prefix))) + + ## RDP - used for copy number correction + cat('Assigning taxonomy to genus level using RDP...\n') + taxa_rdp <- assignTaxonomy(seqtab.nochim, file.path(wd,"rdp_train_set_16.fa.gz"), multithread=threads, minBoot=80, tryRC=T) + cat('Assigning taxonomy at species level using RDP...\n') + taxa_rdp <- addSpecies(taxa_rdp, file.path(wd,"rdp_species_assignment_16.fa.gz"), allowMultiple=T, tryRC=T) + write.csv(taxa_rdp,file=file.path(outdir, sprintf('%s_taxonomy_rdp.csv',prefix))) + save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_tax.Rdata'))) + #load(file = file.path(outdir, paste0(prefix, 'state_post_tax.Rdata'))) + + + # Return data---------------------------------------------------------------------- + cat('Returning data...\n') + ## create fasta file + ASVs <- dada2fasta(outdir, seqtab.nochim, prefix) + ## create master dataframe for each classification + + ## Assigning ASVs to count table + sequences <- colnames(seqtab.nochim) + colnames(seqtab.nochim) <- ASVs + seqtab.nochim <- t(seqtab.nochim) + + ## silva + taxa_silva <- taxa_silva[match(sequences, rownames(taxa_silva)),] + rownames(taxa_silva) <- ASVs + d <- merge(seqtab.nochim, taxa_silva, by='row.names') + colnames(d)[1] <- 'ASV' + ## create database of all information + db <- dbConnect(RSQLite::SQLite(), file.path(outdir, sprintf('%s.sqlite',prefix))) + dbWriteTable(db, 'dada2_results_silva', d) + ## write master dataframe for user, and return it + write.table(d, file.path(outdir, sprintf('%s_dada2_results_silva.txt',prefix)), sep='\t', quote=F, row.names=F) + + ## rdp + taxa_rdp <- taxa_rdp[match(sequences, rownames(taxa_rdp)),] + rownames(taxa_rdp) <- ASVs + d <- merge(seqtab.nochim, taxa_rdp, by='row.names') + colnames(d)[1] <- 'ASV' + ## create database of all information + dbWriteTable(db, 'dada2_results_rdp', d) + dbDisconnect(db) + ## write master dataframe for user, and return it + write.table(d, file.path(outdir, sprintf('%s_dada2_results_rdp.txt',prefix)), sep='\t', quote=F, row.names=F) + return(d) + + cat('DADA2 processing completed!\n') +} + + +option_list = list( + make_option(c("-t", "--threads"), type = "integer", default = 1, + help = "number of threads to use", metavar = "THREADS") +); + +opt_parser = OptionParser(option_list=option_list); +opt = parse_args(opt_parser); + +print(opt) + +threads = as.integer(Sys.getenv("NSLOTS", "1")) + +exit(1) + + + +dada2run(wd='/work/haange/Leaky_gut/', r_path='/work/haange/Leaky_gut', outdir='/work/haange/Leaky_gut/results', prefix='Leaky_gut') +
