Mercurial > repos > matthias > dada2_plotqualityprofile
changeset 11:36224cf72a7b draft
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/tree/topic/dada2/tools/dada2 commit b59ed2476188176bb5b2973f59899b7d733e7641-dirty
| author | matthias | 
|---|---|
| date | Thu, 29 Aug 2019 09:02:34 -0400 | 
| parents | 279014cdd101 | 
| children | c06c6f88da97 | 
| files | dada2.pdf dada2_plotQualityProfile.xml data2.R macros.xml notes.txt static/images/pairpipe.png static/images/pairpipe.svg user_input_functions.R | 
| diffstat | 8 files changed, 30 insertions(+), 703 deletions(-) [+] | 
line wrap: on
 line diff
--- a/dada2_plotQualityProfile.xml Tue May 28 12:47:08 2019 -0400 +++ b/dada2_plotQualityProfile.xml Thu Aug 29 09:02:34 2019 -0400 @@ -1,9 +1,10 @@ -<tool id="dada2_plotQualityProfile" name="dada2: plotQualityProfile" version="@DADA2_VERSION@+galaxy@WRAPPER_VERSION@"> +<tool id="dada2_plotQualityProfile" name="dada2: plotQualityProfile" version="@DADA2_VERSION@+galaxy@WRAPPER_VERSION@" profile="19.09"> <description>plot a visual summary of the quality scores</description> <macros> <import>macros.xml</import> </macros> <expand macro="requirements"/> + <expand macro="stdio"/> <expand macro="version_command"/> <command detect_errors="exit_code"><![CDATA[ ##name files by linking @@ -175,6 +176,8 @@ The distribution of quality scores at each position is shown as a grey-scale heat map, with dark colors corresponding to higher frequency. The plotted lines show positional summary statistics: green is the mean, orange is the median, and the dashed orange lines are the 25th and 75th quantiles. If the sequences vary in length, a red line will be plotted showing the percentage of reads that extend to at least that position. + +@HELP_OVERVIEW@ ]]></help> <expand macro="citations"/> </tool>
--- a/data2.R Tue May 28 12:47:08 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,352 +0,0 @@ -library(dada2) - -# library(DBI) -# library(ggplot2) -library(optparse) -# library(RSQLite) -# library(stringr) - -## source required R functions -source('user_input_functions.R') - -# print dada2 version -print(paste("dada2 version: ", packageVersion("dada2"))) - -# # R function to create fasta file from dada2 output data -# outdir is directory to output fasta file -# taxa is taxonomy file generated by dada2 -# prefix is string for desired prefix attached to output file names - -dada2fasta <- function(outdir, seqtab.nochim, prefix){ - seq <- colnames(seqtab.nochim) - n <- 0 - ASVs <- c() - for(i in seq){ - n <- n + 1 - ASV <- paste('ASV', as.character(n), sep = '_') - ASVs <- c(ASVs, ASV) - line1 <- paste('>',ASV,sep='') - write(line1, file.path(outdir,sprintf('%s.fasta',prefix)), append=T) - write(i, file.path(outdir,sprintf('%s.fasta',prefix)), append=T) - } - return(ASVs) -} - - -# # R DADA2 workflow -# wd is path to fastq files -# r_path is path to user_input_functions.R -# outdir is path to output directory -# prefix is string for desired prefix attached to output file names - -dada2run <- function(wd, r_path, outdir, prefix){ - - # read-in files------------------------------------------------------- - ## obtain vectors of forward and reverse reads based on 'R1' and 'R2' in file names - ## additionally obtain the coressponding sample names for these files - p1 <- c() - p2 <- c() - sample.names <- c() - for(f in list.files(wd, full.names=T)){ - if(grepl('_1.fq', f)){ - sample <- gsub('^.*[/](.*?)_1\\.fq\\.gz', '\\1', f) - sample.names <- c(sample.names, sample) - p1 <- c(p1, f) - } - if(grepl('_2.fq', f)){ - p2 <- c(p2, f) - } - } - fnFs <- sort(p1) - fnRs <- sort(p2) - sample.names <- sort(sample.names) - - save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_test.Rdata'))) - #load(file = file.path(outdir, paste0(prefix, 'state_test.Rdata'))) - - ## print for review - to_view <- data.frame(sample.names, fnFs, fnRs) - cat("The following fastq files were found:\n") - print(to_view) - - # Perform quality filtering and trimming--------------------------------- - ## assign new names to samples - filtFs <- file.path(outdir, paste0(sample.names, 'F_filt.fastq.gz')) - filtRs <- file.path(outdir, paste0(sample.names, 'R_filt.fastq.gz')) - - ## plot forward and reverse quality so that user can decide on filtering parameters - cat('Plotting quality profile of forward reads...\n') - Fqp1 <- plotQualityProfile(fnFs[1]) - #print(Fqp1) - ggsave(sprintf('%s_forward_1_qualityprofile.pdf',prefix), Fqp1, path = outdir, width = 20,height = 15,units = c("cm")) - #ggsave(sprintf('%s_forward_1_qualityprofile.emf',prefix), Fqp1, path = outdir, width = 20,height = 15,units = c("cm")) - Fqp2 <- plotQualityProfile(fnFs[2]) - #print(Fqp2) - ggsave(sprintf('%s_forward_2_qualityprofile.pdf',prefix),Fqp2, path = outdir, width = 20,height = 15,units = c("cm")) - #ggsave(sprintf('%s_forward_2_qualityprofile.emf',prefix), Fqp2, path = outdir, width = 20,height = 15,units = c("cm")) - #cat('Which position would you like to truncate the forward reads at?\nPlease use the red-dashed lines as a guide, where they stop appearing indicates good quality.\nNOTE: Do NOT over-trim! You still require overlap between your forward and reverse reads in order to merge them later!\n') - len1 <- 240 - cat('Plotting quality profile of reverse reads...\n') - Rqp1 <- plotQualityProfile(fnRs[1]) - #print(Rqp1) - ggsave(sprintf('%s_reverse_1_qualityprofile.pdf',prefix),Rqp1, path = outdir, width = 20,height = 15,units = c("cm")) - #ggsave(sprintf('%s_reverse_1_qualityprofile.emf',prefix), Rqp1, path = outdir, width = 20,height = 15,units = c("cm")) - Rqp2 <- plotQualityProfile(fnRs[2]) - #print(Rqp2) - ggsave(sprintf('%s_reverse_2_qualityprofile.pdf',prefix), Rqp2, path = outdir, width = 20,height = 15,units = c("cm")) - #ggsave(sprintf('%s_reverse_2_qualityprofile.emf',prefix), Rqp2, path = outdir, width = 20,height = 15,units = c("cm")) - #cat('Which position would you like to truncate the forward reads at?\nPlease use the red-dashed lines as a guide, where they stop appearing indicates good quality.\nNOTE: Do NOT over-trim! You still require overlap between your forward and reverse reads in order to merge them later!\n') - len2 <- 240 - - ## filter and trim - ## remaining parameters set to recommended defaults - ## maxN must be set to 0 (DADA2 requries no Ns) - ## The maxEE parameter sets the maximum number of "expected errors" allowed in a read, which is a better filter than simply averaging quality scores. - ## If not using Windows, you may set multithread to TRUE - ## NOTE: Do not use the truncLen parameter for ITS sequencing - ## trimLeft needs to be varied based on the size of your primers (i.e. it is used to trim your primer sequences)!! - cat('Filtering and trimming sequences...\n') - out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(len1,len2), maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=T, compress=T, multithread=threads, trimLeft=15) - - ## have user review read count changes, and relax error rate if too many reads are lost - ## for example, you may especially want to relax the number of expected errors on the reverse reads (i.e. maxEE=c(2,5)), as the reverse is prone to error on the Illumina sequencing platform - print(head(out)) - check2 <- T - while(check2 == F){ - maxF <- numeric_input("What would you like the maximum number of expected errors in the forward reads to be?\nDefault 2:", 2) - maxR <- numeric_input("What would you like the maximum number of expected errors in the reverse reads to be?\nDefault 5:", 5) - out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(len1,len2), maxN=0, maxEE=c(maxF,maxR), truncQ=2, rm.phix=T, compress=T, multithread=threads) - print(head(out)) - check2 <- yn_input('Proceed? If you lost too many reads, you can choose to not proceed and you will have the option to relax the error rate. Default yes:',T) - } - - # Have DADA2 learn the error rates----------------------------------------------- - ## If not using Windows, you may set multithread to TRUE - read.subset <- 1e6 - cat('Learning error rate of forward reads...\n') - errF <- learnErrors(filtFs, nreads=read.subset, multithread=threads) - - ## have user check estimated error rates - ## note the calculations are done with a subset of the total reads, as it is computationally intensive - ## if the fit is poor, the user has the option to try again with an increased subset number of reads - Error_f <- plotErrors(errF, nominalQ = T) - #print(Error_f) - ggsave(sprintf('%s_forward_Error_plot.pdf',prefix), Error_f, path = outdir, width = 20,height = 15,units = c("cm")) - #ggsave(sprintf('%s_forward_Error_plot.emf',prefix), Error_f, path = outdir, width = 20,height = 15,units = c("cm")) - check3a <- T - while(check3a == F){ - read.subset <- numeric_input('Please specify the number of reads you would like dada2 to utilize to calculate the error rate.\nThe default previously used was 1e6.\nThe newly recommended default is 10-fold greater,\n1e7:',1e7) - errF <- learnErrors(filtFs, nreads=read.subset, multithread=threads) - print(Error_f) - ggsave(sprintf('%s_forward_Error_plot.pdf',prefix), path = outdir, width = 20,height = 15,units = c("cm")) - ggsave(sprintf('%s_forward_Error_plot.emf',prefix), path = outdir, width = 20,height = 15,units = c("cm")) - check3a <- yn_input('Proceed?\nThe estimated error rate (black line) should fit to the observed error rates for each consensus quality score (black points).\nAdditionally, the error rates expected under the nominal definition of the Q-value (quality score) should decrease as the quality score increases (or flat-line).\nIf you do not have a good fit, you may want dada2 to re-learn the error rates with a higher number of reads in the utilized subset.\nA subset of reads is always used as the algorithm is computationally intensive.\nDefault yes:',T) - } - - - ## also do for reverseL - cat('Learning error rate of reverse reads...\n') - errR <- learnErrors(filtRs, nreads=read.subset, multithread=threads) - Error_r <- plotErrors(errR, nominalQ=T) - #print(Error_r) - ggsave(sprintf('%s_reverse_Error_plot.pdf',prefix), Error_r, path = outdir, width = 20,height = 15,units = c("cm")) - #ggsave(sprintf('%s_reverse_Error_plot.emf',prefix), Error_r, path = outdir, width = 20,height = 15,units = c("cm")) - check3b <- T - while(check3b == F){ - read.subset <- numeric_input('Please specify the number of reads you would like dada2 to utilize to calculate the error rate.\nThe default previously used was 1e6.\nThe newly recommended default is 10-fold greater,\n1e7:',1e7) - errR <- learnErrors(filtRs, nreads=read.subset, multithread=threads) - print(Error_r) - ggsave(sprintf('%s_reverse_Error_plot.pdf',prefix), path = outdir, width = 20,height = 15,units = c("cm")) - #ggsave(sprintf('%s_reverse_Error_plot.emf',prefix), path = outdir, width = 20,height = 15,units = c("cm")) - check3b <- yn_input('Proceed?\nThe estimated error rate (black line) should fit to the observed error rates for each consensus quality score (black points).\nAdditionally, the error rates expected under the nominal definition of the Q-value (quality score) should decrease as the quality score increases (or flat-line).\nIf you do not have a good fit, you may want dada2 to re-learn the error rates with a higher number of reads in the utilized subset.\nA subset of reads is always used as the algorithm is computationally intensive.\nDefault yes:',T) - } - - save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_learning.Rdata'))) - #load(file = file.path(outdir, paste0(prefix, 'state_post_learning.Rdata'))) - - # Dereplicate sequences to generate unique sequence fastq files with corresponding count tables------------------------- - ## NOTE: if your dataset is huge, you may run out of RAM. Please see https://benjjneb.github.io/dada2/bigdata.html for details. - cat('Dereplicating forward reads...\n') - derepFs <- derepFastq(filtFs, verbose=T) - cat('Dereplicating reverse reads...\n') - derepRs <- derepFastq(filtRs, verbose=T) - - ## name the derep-class objects by sample names - names(derepFs) <- sample.names - names(derepRs) <- sample.names - - save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_derep.Rdata'))) - #load(file = file.path(outdir, paste0(prefix, 'state_post_derep.Rdata'))) - - # Infer sequence variants using learned error rates--------------------------------- - ## If not using Windows, you may set multithread to TRUE - ## NOTE: if your dataset is huge, you may run out of RAM. Please see https://benjjneb.github.io/dada2/bigdata.html for details. - ## NOTE2: you can use DADA2 for 454 or IonTorrent data as well. Please see https://benjjneb.github.io/dada2/tutorial.html. - s.pool = F - cat('Inferring sequence variants of forward reads...\n') - dadaFs <- dada(derepFs, err=errF, pool=s.pool, multithread=threads) - - ## have user inspect detected number of sequence variants, to ensure it is logical based on the biological context of their samples - ## if you have low sampling depths, you may not want to process each sample independently as per default, but set pool=T. It gives better results at increased computation time. The user will have the option to do this if the number of sequence variants doesn't make sense. - print(dadaFs[[1]]) - check4 <- T - if(check4 == F){ - s.pool = T - dadaFs <- dada(derepFs, err=errF, pool=s.pool, multithread=threads) - print(dadaFs[[1]]) - cat('Hopefully, these results make more sense.\nOtherwise, there is not much more you can do except start over!\n') - check <- yn_input('Proceed? Default yes, no to quit:',T) - if(check == F){ - stop() - } - } - - ## also do for reverse, but don't need to re-check as you need to keep the pool consistent between the forward and reverse! - cat('Inferring sequence variants of reversed reads...\n') - dadaRs <- dada(derepRs, err=errR, pool=s.pool, multithread=threads) - - save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_dada.Rdata'))) - #load(file = file.path(outdir, paste0(prefix, 'state_post_dada.Rdata'))) - - - # Merge forward and reverse reads------------------------------------------------- - cat('Merging paired-end reads...\n') - mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=T) - #cat('Most of your reads should have been retained (i.e. were able to merge, see above).\nOtherwise, there is not much more you can do except start over (i.e. did you over-trim your sequences??)!\n') - check <- T - if(check == F){ - stop() - } - - # Construct sequences table------------------------------------------------------- - cat('Constructing sequence table...\n') - seqtab <- makeSequenceTable(mergers) - - save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_merge.Rdata'))) - #load(file = file.path(outdir, paste0(prefix, 'state_post_merge.Rdata'))) - - - ## inspect distribution of sequence lengths - ## give user the option to filter out overly long or short sequneces - cat('Sequence length distribution listed below:\n') - print(table(nchar(getSequences(seqtab)))) - check5 <- T - if(check5 == F){ - min.cutoff <- numeric_input('Please input desired minimum length of sequences:',NULL) - max.cutoff <- numeric_input('Please input desired maximum length of sequences:',NULL) - seqtab <- seqtab[,nchar(colnames(seqtab)) %in% seq(min.cutoff,max.cutoff)] - } - - # Remove chimeras------------------------------------------------------------------ - ## If not using Windows, you may set multithread to TRUE - cat('Removing chimeras...\n') - seqtab.nochim <- removeBimeraDenovo(seqtab, method='consensus', multithread=threads, verbose=T) - - ## display percentage of chimeras removed - ## this number should be small (<5%), otherwise some processing parameters need to be revisited - percent.nochim <- (sum(seqtab.nochim)/sum(seqtab))*100 - percent.nochim <- paste(as.character(percent.nochim),'of reads retained after chimera removal.\n',sep=' ') - cat(percent.nochim) - #cat('Most of your reads should have been retained.\nOtherwise, there is not much more you can do except start over!\n') - check <- T - if(check == F){ - stop() - } - - # Final sanity check-------------------------------------------------------------- - ## track reads removed throughout the pipeline - ## If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs) - getN <- function(x) sum(getUniques(x)) - track <- cbind(out, sapply(dadaFs, getN), sapply(mergers, getN), rowSums(seqtab), rowSums(seqtab.nochim)) - colnames(track) <- c("input", "filtered", "denoised", "merged", "tabled", "nonchim") - rownames(track) <- sample.names - print(head(track)) - #cat('Most of your reads should have been retained.\nOtherwise, there is not much more you can do except start over!\n') - check <- T - if(check == F){ - stop() - } - - write.csv(track,file=file.path(outdir, sprintf('%s_read_count-quality_control.csv',prefix))) - - save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_chimera.Rdata'))) -# load(file = file.path(outdir, paste0(prefix, 'state_post_chimera.Rdata'))) - - # Assign taxonomy----------------------------------------------------------------- - ## require silva database files - ## If not using Windows, you may set multithread to TRUE - ## Minimum boot strap should be 80, but for sequnce length =< 250 Minimum bootstrap set to 50 (which is also the default) - - ## SILVA - cat('Assigning taxonomy to genus level using SILVA...\n') - taxa_silva <- assignTaxonomy(seqtab.nochim, file.path(wd,"silva_nr_v132_train_set.fa.gz"), multithread=threads, minBoot=80, tryRC=T) - cat('Assigning taxonomy at species level using SILVA...\n') - taxa_silva <- addSpecies(taxa_silva, file.path(wd,"silva_species_assignment_v132.fa.gz"), allowMultiple=T, tryRC=T) - write.csv(taxa_silva,file=file.path(outdir, sprintf('%s_taxonomy_silva.csv',prefix))) - - ## RDP - used for copy number correction - cat('Assigning taxonomy to genus level using RDP...\n') - taxa_rdp <- assignTaxonomy(seqtab.nochim, file.path(wd,"rdp_train_set_16.fa.gz"), multithread=threads, minBoot=80, tryRC=T) - cat('Assigning taxonomy at species level using RDP...\n') - taxa_rdp <- addSpecies(taxa_rdp, file.path(wd,"rdp_species_assignment_16.fa.gz"), allowMultiple=T, tryRC=T) - write.csv(taxa_rdp,file=file.path(outdir, sprintf('%s_taxonomy_rdp.csv',prefix))) - save(list = ls(all.names = TRUE), file = file.path(outdir, paste0(prefix, 'state_post_tax.Rdata'))) - #load(file = file.path(outdir, paste0(prefix, 'state_post_tax.Rdata'))) - - - # Return data---------------------------------------------------------------------- - cat('Returning data...\n') - ## create fasta file - ASVs <- dada2fasta(outdir, seqtab.nochim, prefix) - ## create master dataframe for each classification - - ## Assigning ASVs to count table - sequences <- colnames(seqtab.nochim) - colnames(seqtab.nochim) <- ASVs - seqtab.nochim <- t(seqtab.nochim) - - ## silva - taxa_silva <- taxa_silva[match(sequences, rownames(taxa_silva)),] - rownames(taxa_silva) <- ASVs - d <- merge(seqtab.nochim, taxa_silva, by='row.names') - colnames(d)[1] <- 'ASV' - ## create database of all information - db <- dbConnect(RSQLite::SQLite(), file.path(outdir, sprintf('%s.sqlite',prefix))) - dbWriteTable(db, 'dada2_results_silva', d) - ## write master dataframe for user, and return it - write.table(d, file.path(outdir, sprintf('%s_dada2_results_silva.txt',prefix)), sep='\t', quote=F, row.names=F) - - ## rdp - taxa_rdp <- taxa_rdp[match(sequences, rownames(taxa_rdp)),] - rownames(taxa_rdp) <- ASVs - d <- merge(seqtab.nochim, taxa_rdp, by='row.names') - colnames(d)[1] <- 'ASV' - ## create database of all information - dbWriteTable(db, 'dada2_results_rdp', d) - dbDisconnect(db) - ## write master dataframe for user, and return it - write.table(d, file.path(outdir, sprintf('%s_dada2_results_rdp.txt',prefix)), sep='\t', quote=F, row.names=F) - return(d) - - cat('DADA2 processing completed!\n') -} - - -option_list = list( - make_option(c("-t", "--threads"), type = "integer", default = 1, - help = "number of threads to use", metavar = "THREADS") -); - -opt_parser = OptionParser(option_list=option_list); -opt = parse_args(opt_parser); - -print(opt) - -threads = as.integer(Sys.getenv("NSLOTS", "1")) - -exit(1) - - - -dada2run(wd='/work/haange/Leaky_gut/', r_path='/work/haange/Leaky_gut', outdir='/work/haange/Leaky_gut/results', prefix='Leaky_gut') -
--- a/macros.xml Tue May 28 12:47:08 2019 -0400 +++ b/macros.xml Thu Aug 29 09:02:34 2019 -0400 @@ -16,6 +16,13 @@ ]]></version_command> </xml> + <xml name="stdio"> + <stdio> + <regex match="Error: cannot allocate" source="stderr" level="fatal_oom" description="Out of memory error occurred" /> + <regex match="'Calloc' could not allocate memory" source="stderr" level="fatal_oom" description="Out of memory error occurred" /> + </stdio> + </xml> + <xml name="citations"> <citations> <citation type="doi">10.1038/nmeth.3869</citation>
--- a/notes.txt Tue May 28 12:47:08 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,148 +0,0 @@ -TODO -==== - - - -If we make a monolithic tool: - -* implement sanity checks between important compute intensive steps (user definable criteria, abort if violated) - -If we keep separate tools: - -- make Rdata data types specific (like xmcs https://github.com/workflow4metabolomics/xcms/tree/dev/datatypes) -* alternatively the data set types could be derived from tabular and the Rdata could be attached via - `.extra_files_path` this way the user could have some intermediate output that he could look at. - - -In both cases: - -* allow input of single end data, single pair, single pair in separate data sets, ... -* add mergePairsByID functionality to mergePairs tool - - -Datatypes: -========== - -**derep-class**: list w 3 members -- uniques: Named integer vector. Named by the unique sequence, valued by abundance. -• quals: Numeric matrix of average quality scores by position for each unique. Uniques are -rows, positions are cols. -* map: Integer vector of length the number of reads, and value the index (in uniques) of the -unique to which that read was assigned. - -**learnErrorsOutput**: A named list with three entries -- err_out: A numeric matrix with the learned error rates. -- err_in: The initialization error rates (unimportant). -- trans: A feature table of observed transitions for each type (eg. A->C) and quality score. - -**dada-class**: A multi-item List with the following named values... -• denoised: Integer vector, named by sequence valued by abundance, of the denoised sequences. -• clustering: An informative data.frame containing information on each cluster. -• sequence: A character vector of each denoised sequence. Identical to names(denoised). -• quality: The average quality scores for each cluster (row) by position (col). -• map: Integer vector that maps the unique (index of derep.unique) to the denoised sequence (index of dada.denoised). -• birth_subs: A data.frame containing the substitutions at the birth of each new cluster. -• trans: The matrix of transitions by type (row), eg. A2A, A2C..., and quality score (col) -observed in the final output of the dada algorithm. -• err_in: The err matrix used for this invocation of dada. -• err_out: The err matrix estimated from the output of dada. NULL if err_function not provided. -• opts: A list of the dada_opts used for this invocation of dada. -• call: The function call used for this invocation of dada. - -**uniques**: derep, dada, mergepairs(or data frame w sequenc and abundance columns) - -**mergepairs**: - -data.frame(s) has a row for each unique pairing of forward/reverse denoised sequences, and the following columns: -• abundance: Number of reads corresponding to this forward/reverse combination. -• sequence: The merged sequence. -• forward: The index of the forward denoised sequence. -• reverse: The index of the reverse denoised sequence. -• nmatch: Number of matches nts in the overlap region. -• nmismatch: Number of mismatches in the overlap region. -• nindel: Number of indels in the overlap region. -• prefer: The sequence used for the overlap region. 1=forward; 2=reverse. -• accept: TRUE if overlap between forward and reverse denoised sequences was at least minOverlap and had at most maxMismatch differences. FALSE otherwise. -• ...: Additional columns specified in propagateCol - - - -Tools: -====== - -• Quality filtering - - filterAndTrim IO=(fastq -> fastq) - -• Dereplication - - derepFastq (fastq -> derep-class object) - -• Learn error rates - - learnErrors + plotErrors - - in: input list, or vector, of file names (or a list of derep-class objects WHY .. learning should be done on full data) - - out: named list w entries - - \$err\_out: A numeric matrix with the learned error rates. - - \$err\_in: The initialization error rates (unimportant). - - \$trans: A feature table of observed transitions for each type (eg. A->C) and quality score - -• Sample Inference (dada) - in: (list of) derep-class object - out: (list of) dada-class object - -• Chimera Removal - - removeBimeraDenovo - - in: A uniques-vector or any object that can be coerced into one with getUniques. - out: A uniques vector, or an object of matching class if a data.frame or sequence table is provided - -• Merging of Paired Reads - - mergePairs - in: 2x dada-class object(s), 2x derep-class object(s) - out: A data.frame, or a list of data.frames. - - The return data.frame(s) has a row for each unique pairing of forward/reverse denoised sequences, - - cols - - \$abundance: Number of reads corresponding to this forward/reverse combination. - - \$sequence: The merged sequence. - - \$forward: The index of the forward denoised sequence. - - \$reverse: The index of the reverse denoised sequence. - - \$nmatch: Number of matches nts in the overlap region. - - \$nmismatch: Number of mismatches in the overlap region. - - \$nindel: Number of indels in the overlap region. - - \$prefer: The sequence used for the overlap region. 1=forward; 2=reverse. - - \$accept: TRUE if overlap between forward and reverse denoised sequences was at least minOverlap and had at most maxMismatch differences. FALSE otherwise. - - \$...: Additional columns specified in propagateCol. - -• Taxonomic Classification (assignTaxonomy, assignSpecies) - -* Other - - makeSequenceTable - in A list of the samples to include in the sequence table. Samples can be provided in any format that can be processed by getUniques - out Named integer matrix (row for each sample, column for each unique sequence) - - mergeSequenceTables - - uniquesToFasta - in: A uniques-vector or any object that can be coerced into one with getUniques. - - getSequences - - extracts the sequences from several different data objects: including including dada-class - and derep-class objects, as well as data.frame objects that have both \$sequence and \$abun- - dance columns. - - getUniques - - extracts the uniques-vector from several different data objects, including dada-class - and derep-class objects, as well as data.frame objects that have both \$sequence and \$abundance - columns - - plotQualityProfile - - seqComplexity - - setDadaOpt(...)
--- a/static/images/pairpipe.svg Tue May 28 12:47:08 2019 -0400 +++ b/static/images/pairpipe.svg Thu Aug 29 09:02:34 2019 -0400 @@ -45,12 +45,12 @@ inkscape:snap-bbox="false" inkscape:object-paths="true" inkscape:zoom="1" - inkscape:cx="685.79259" - inkscape:cy="205.1652" + inkscape:cx="654.80177" + inkscape:cy="201.1652" inkscape:window-x="0" inkscape:window-y="27" inkscape:window-maximized="1" - inkscape:current-layer="svg384" + inkscape:current-layer="layer1" fit-margin-top="0" fit-margin-left="0" fit-margin-right="0" @@ -919,7 +919,15 @@ inkscape:groupmode="layer" id="layer1" inkscape:label="Layer 1" - transform="translate(-178.97434,-516.80337)" /> + transform="translate(-178.97434,-516.80337)"> + <path + sodipodi:nodetypes="cc" + inkscape:connector-curvature="0" + id="path1104" + d="m 547.55976,25.196631 41,34" + style="stroke:#000000;marker-end:url(#id2)" + transform="translate(178.97434,516.80337)" /> + </g> <g inkscape:groupmode="layer" id="layer2" @@ -978,5 +986,11 @@ sodipodi:role="line" id="tspan1102" x="214" - y="-108.17315"></tspan></text> + y="-108.17315" /></text> + <path + style="stroke:#000000;marker-end:url(#id2)" + d="m 549.55976,123.19663 38,44" + id="path1255" + inkscape:connector-curvature="0" + sodipodi:nodetypes="cc" /> </svg>
--- a/user_input_functions.R Tue May 28 12:47:08 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,197 +0,0 @@ -# defining functions for checking user inputs - -# requesting directory input---------------------------------------------------------- -dir_input <- function(prompt) { - check <- FALSE - while(check == FALSE) { - user_input <- readline(prompt) - check <- dir.exists(user_input) - if(check==FALSE) { - msg <- sprintf("The directory: %s not found.", user_input) - message(msg) - } - } - return(user_input) -} - -# requesting file input -file_input <- function(prompt, directory) { - check <- FALSE - while(check == FALSE) { - user_input <- readline(prompt) - check <- file.exists(file.path(directory, user_input)) - if(check==FALSE) { - msg <- sprintf("File: %s not found.", user_input) - message(msg) - } - } - return(user_input) -} - -# requesting string input------------------------------------------------------------ -string_input <- function(prompt) { - check <- FALSE - while(check == FALSE) { - user_input <- readline(prompt) - check <- user_input!='' - if(check == FALSE) { - message("Input can't be empty.") - } - } - return(user_input) -} - -# requesting integer input---------------------------------------------------------- -numeric_input <- function(prompt, default) { - check <- FALSE - while(check == FALSE) { - user_input <- readline(prompt) - - # if blank, set user_input to defalut - if(user_input == '') { - user_input <- default - msg <- sprintf("No input supplied, default of %s used.", default) - message(msg) - } - # coerce input to be numeric - else { - user_input <- as.numeric(user_input) - } - # check if number supplied - check <- !is.na(user_input) - if(check == FALSE) { - message("Input must be a number.") - } - } - return(user_input) -} - -# request constrained string--------------------------------------------------------- -# default is numeric, referring to the index of desired option in 'choices' -cons_string_input <- function(prompt, choices, default) { - - if(missing(default)){ - check <- FALSE - while(check == FALSE) { - user_input <- as.numeric(menu(choices, graphics=FALSE, title=prompt)) - user_input <- choices[user_input] - - check <- user_input!='' - if(check == FALSE) message("Input can't be empty.") - else message(sprintf("\nSelected: %s", user_input)) - } - } - - if(!missing(default)){ - # setting up prompt with menu - num_choices <- str_c(1:length(choices), choices, sep='. ') - menu_prompt <- sprintf("%s\n\t%s\n", prompt, str_c(num_choices, collapse='\n\t')) - message(menu_prompt) - - # requesting user input - user_input <- readline("Selection:") - - # setting default - if(user_input == '') { - user_input <- as.numeric(default) - user_input <- choices[user_input] - msg <- sprintf("No input supplied, default of %s used.", user_input) - message(msg) - } - else { - user_input <- as.numeric(user_input) - user_input <- choices[user_input] - message(sprintf("\nSelected: %s", user_input)) - } - } - - return(user_input) -} - -# request multiple inputs----------------------------------------------------------- -## default is optional; can set default to NULL if want to allow blank entry -cons_string_mult <- function(prompt, choices, default) { - if(missing(default)) { - check <- FALSE - while(check == FALSE) { - user_input <- select.list(choices=choices, multiple=TRUE, title=prompt, - graphics = FALSE) - - if(length(user_input)==0) { - message("Input can't be empty.") - check <- FALSE - } - else { - message(sprintf("\nSelected: %s", user_input)) - check <- TRUE - } - } - } - - if(!missing(default)) { - user_input <- select.list(choices=choices, multiple=TRUE, title=prompt, - graphics=FALSE) - - if(length(user_input)==0) { - if(is.null(default)) user_input <- default - else user_input <- default - msg <- sprintf("No input supplied, using default:%s", - paste(user_input, collapse=', ')) - message(msg) - } - - } - - return(user_input) - -} - -# yesno input------------------------------------------------------------------- -# default is TRUE or FALSE -yn_input <- function(prompt, default) { - - choices <- c("Yes", "No") - - if(missing(default)) { - check <- FALSE - while(check == FALSE) { - user_input <- menu(choices, graphics=FALSE, title=prompt) - - if(length(user_input)==0) { - message("Input can't be empty.") - check <- FALSE - } - else { - message(sprintf("\nSelected: %s", user_input)) - check <- TRUE - } - } - user_input <- ifelse(user_input == 1L, TRUE, FALSE) - } - if(!missing(default)) { - - # setting up prompt with menu - num_choices <- str_c(1:length(choices), choices, sep='. ') - menu_prompt <- sprintf("%s\n\t%s\n", prompt, str_c(num_choices, collapse='\n\t')) - message(menu_prompt) - - # requesting user input - user_input <- readline("Selection:") - - # setting default - if(user_input == '') { - user_input <- as.numeric(default) - msg <- sprintf("No input supplied, default of %s used.", choices[user_input]) - message(msg) - user_input <- ifelse(user_input == 1L, TRUE, FALSE) - } - else { - user_input <- as.numeric(user_input) - message(sprintf("\nSelected: %s", choices[user_input])) - user_input <- ifelse(user_input == 1L, TRUE, FALSE) - } - } - - - return(user_input) -} \ No newline at end of file
