Mercurial > repos > iuc > deseq2
comparison deseq2.R @ 17:24930eeff731 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deseq2 commit 23559e873291b04840f186edb0b4b85e76359e05
| author | iuc |
|---|---|
| date | Thu, 03 Nov 2016 16:03:53 -0400 |
| parents | |
| children | a5e49c34745d |
comparison
equal
deleted
inserted
replaced
| 16:a7bf54ecfa9d | 17:24930eeff731 |
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 | |
| 3 # A command-line interface to DESeq2 for use with Galaxy | |
| 4 # written by Bjoern Gruening and modified by Michael Love 2016.03.30 | |
| 5 # | |
| 6 # one of these arguments is required: | |
| 7 # | |
| 8 # 'factors' a JSON list object from Galaxy | |
| 9 # | |
| 10 # 'sample_table' is a sample table as described in ?DESeqDataSetFromHTSeq | |
| 11 # with columns: sample name, filename, then factors (variables) | |
| 12 # | |
| 13 # the output file has columns: | |
| 14 # | |
| 15 # baseMean (mean normalized count) | |
| 16 # log2FoldChange (by default a moderated LFC estimate) | |
| 17 # lfcSE (the standard error) | |
| 18 # stat (the Wald statistic) | |
| 19 # pvalue (p-value from comparison of Wald statistic to a standard Normal) | |
| 20 # padj (adjusted p-value, Benjamini Hochberg correction on genes which pass the mean count filter) | |
| 21 # | |
| 22 # the first variable in 'factors' and first column in 'sample_table' will be the primary factor. | |
| 23 # the levels of the primary factor are used in the order of appearance in factors or in sample_table. | |
| 24 # | |
| 25 # by default, levels in the order A,B,C produces a single comparison of B vs A, to a single file 'outfile' | |
| 26 # | |
| 27 # for the 'many_contrasts' flag, levels in the order A,B,C produces comparisons C vs A, B vs A, C vs B, | |
| 28 # to a number of files using the 'outfile' prefix: 'outfile.condition_C_vs_A' etc. | |
| 29 # all plots will still be sent to a single PDF, named by the arg 'plots', with extra pages. | |
| 30 # | |
| 31 # fit_type is an integer valued argument, with the options from ?estimateDisperions | |
| 32 # 1 "parametric" | |
| 33 # 2 "local" | |
| 34 # 3 "mean" | |
| 35 | |
| 36 # setup R error handling to go to stderr | |
| 37 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) | |
| 38 | |
| 39 # we need that to not crash galaxy with an UTF8 error on German LC settings. | |
| 40 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | |
| 41 | |
| 42 library("getopt") | |
| 43 library("tools") | |
| 44 options(stringAsFactors = FALSE, useFancyQuotes = FALSE) | |
| 45 args <- commandArgs(trailingOnly = TRUE) | |
| 46 | |
| 47 # get options, using the spec as defined by the enclosed list. | |
| 48 # we read the options from the default: commandArgs(TRUE). | |
| 49 spec <- matrix(c( | |
| 50 "quiet", "q", 0, "logical", | |
| 51 "help", "h", 0, "logical", | |
| 52 "outfile", "o", 1, "character", | |
| 53 "countsfile", "n", 1, "character", | |
| 54 "factors", "f", 1, "character", | |
| 55 "plots" , "p", 1, "character", | |
| 56 "sample_table", "s", 1, "character", | |
| 57 "tximport", "i", 0, "logical", | |
| 58 "tx2gene", "x", 1, "character", # a space-sep tx-to-gene map or GTF file (auto detect .gtf/.GTF) | |
| 59 "fit_type", "t", 1, "integer", | |
| 60 "many_contrasts", "m", 0, "logical", | |
| 61 "outlier_replace_off" , "a", 0, "logical", | |
| 62 "outlier_filter_off" , "b", 0, "logical", | |
| 63 "auto_mean_filter_off", "c", 0, "logical", | |
| 64 "beta_prior_off", "d", 0, "logical"), | |
| 65 byrow=TRUE, ncol=4) | |
| 66 opt <- getopt(spec) | |
| 67 | |
| 68 # if help was asked for print a friendly message | |
| 69 # and exit with a non-zero error code | |
| 70 if (!is.null(opt$help)) { | |
| 71 cat(getopt(spec, usage=TRUE)) | |
| 72 q(status=1) | |
| 73 } | |
| 74 | |
| 75 # enforce the following required arguments | |
| 76 if (is.null(opt$outfile)) { | |
| 77 cat("'outfile' is required\n") | |
| 78 q(status=1) | |
| 79 } | |
| 80 if (is.null(opt$sample_table) & is.null(opt$factors)) { | |
| 81 cat("'factors' or 'sample_table' is required\n") | |
| 82 q(status=1) | |
| 83 } | |
| 84 | |
| 85 verbose <- if (is.null(opt$quiet)) { | |
| 86 TRUE | |
| 87 } else { | |
| 88 FALSE | |
| 89 } | |
| 90 | |
| 91 if (!is.null(opt$tximport)) { | |
| 92 if (is.null(opt$tx2gene)) stop("A transcript-to-gene map or a GTF file is required for tximport") | |
| 93 if (tolower(file_ext(opt$tx2gene)) == "gtf") { | |
| 94 gtfFile <- opt$tx2gene | |
| 95 } else { | |
| 96 gtfFile <- NULL | |
| 97 tx2gene <- read.table(opt$tx2gene, header=FALSE) | |
| 98 } | |
| 99 useTXI <- TRUE | |
| 100 } else { | |
| 101 useTXI <- FALSE | |
| 102 } | |
| 103 | |
| 104 suppressPackageStartupMessages({ | |
| 105 library("DESeq2") | |
| 106 library("RColorBrewer") | |
| 107 library("gplots") | |
| 108 }) | |
| 109 | |
| 110 # build or read sample table | |
| 111 | |
| 112 trim <- function (x) gsub("^\\s+|\\s+$", "", x) | |
| 113 | |
| 114 # switch on if 'factors' was provided: | |
| 115 if (!is.null(opt$factors)) { | |
| 116 library("rjson") | |
| 117 parser <- newJSONParser() | |
| 118 parser$addData(opt$factors) | |
| 119 factorList <- parser$getObject() | |
| 120 factors <- sapply(factorList, function(x) x[[1]]) | |
| 121 primaryFactor <- factors[1] | |
| 122 filenamesIn <- unname(unlist(factorList[[1]][[2]])) | |
| 123 sampleTable <- data.frame(sample=basename(filenamesIn), | |
| 124 filename=filenamesIn, | |
| 125 row.names=filenamesIn, | |
| 126 stringsAsFactors=FALSE) | |
| 127 for (factor in factorList) { | |
| 128 factorName <- trim(factor[[1]]) | |
| 129 sampleTable[[factorName]] <- character(nrow(sampleTable)) | |
| 130 lvls <- sapply(factor[[2]], function(x) names(x)) | |
| 131 for (i in seq_along(factor[[2]])) { | |
| 132 files <- factor[[2]][[i]][[1]] | |
| 133 sampleTable[files,factorName] <- trim(lvls[i]) | |
| 134 } | |
| 135 sampleTable[[factorName]] <- factor(sampleTable[[factorName]], levels=lvls) | |
| 136 } | |
| 137 rownames(sampleTable) <- sampleTable$sample | |
| 138 } else { | |
| 139 # read the sample_table argument | |
| 140 # this table is described in ?DESeqDataSet | |
| 141 # one column for the sample name, one for the filename, and | |
| 142 # the remaining columns for factors in the analysis | |
| 143 sampleTable <- read.delim(opt$sample_table, stringsAsFactors=FALSE) | |
| 144 factors <- colnames(sampleTable)[-c(1:2)] | |
| 145 for (factor in factors) { | |
| 146 lvls <- unique(as.character(sampleTable[[factor]])) | |
| 147 sampleTable[[factor]] <- factor(sampleTable[[factor]], levels=lvls) | |
| 148 } | |
| 149 } | |
| 150 | |
| 151 primaryFactor <- factors[1] | |
| 152 designFormula <- as.formula(paste("~", paste(rev(factors), collapse=" + "))) | |
| 153 | |
| 154 if (verbose) { | |
| 155 cat("DESeq2 run information\n\n") | |
| 156 cat("sample table:\n") | |
| 157 print(sampleTable[,-c(1:2),drop=FALSE]) | |
| 158 cat("\ndesign formula:\n") | |
| 159 print(designFormula) | |
| 160 cat("\n\n") | |
| 161 } | |
| 162 | |
| 163 # these are plots which are made once for each analysis | |
| 164 generateGenericPlots <- function(dds, factors) { | |
| 165 rld <- rlog(dds) | |
| 166 d=plotPCA(rld, intgroup=rev(factors), returnData=TRUE) | |
| 167 labs <- paste0(seq_len(ncol(dds)), ": ", do.call(paste, as.list(colData(dds)[factors]))) | |
| 168 library(ggplot2) | |
| 169 print(ggplot(d, aes(x=PC1,y=PC2, col=group,label=factor(labs)), environment = environment()) + geom_point() + geom_text(size=3)) | |
| 170 dat <- assay(rld) | |
| 171 colnames(dat) <- labs | |
| 172 distsRL <- dist(t(dat)) | |
| 173 mat <- as.matrix(distsRL) | |
| 174 hc <- hclust(distsRL) | |
| 175 hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100) | |
| 176 heatmap.2(mat, Rowv=as.dendrogram(hc), symm=TRUE, trace="none", col = rev(hmcol), | |
| 177 main="Sample-to-sample distances", margin=c(13,13)) | |
| 178 plotDispEsts(dds, main="Dispersion estimates") | |
| 179 } | |
| 180 | |
| 181 # these are plots which can be made for each comparison, e.g. | |
| 182 # once for C vs A and once for B vs A | |
| 183 generateSpecificPlots <- function(res, threshold, title_suffix) { | |
| 184 use <- res$baseMean > threshold | |
| 185 if (sum(!use) == 0) { | |
| 186 h <- hist(res$pvalue, breaks=0:50/50, plot=FALSE) | |
| 187 barplot(height = h$counts, | |
| 188 col = "powderblue", space = 0, xlab="p-values", ylab="frequency", | |
| 189 main=paste("Histogram of p-values for",title_suffix)) | |
| 190 text(x = c(0, length(h$counts)), y = 0, label=paste(c(0,1)), adj=c(0.5,1.7), xpd=NA) | |
| 191 } else { | |
| 192 h1 <- hist(res$pvalue[!use], breaks=0:50/50, plot=FALSE) | |
| 193 h2 <- hist(res$pvalue[use], breaks=0:50/50, plot=FALSE) | |
| 194 colori <- c("filtered (low count)"="khaki", "not filtered"="powderblue") | |
| 195 barplot(height = rbind(h1$counts, h2$counts), beside = FALSE, | |
| 196 col = colori, space = 0, xlab="p-values", ylab="frequency", | |
| 197 main=paste("Histogram of p-values for",title_suffix)) | |
| 198 text(x = c(0, length(h1$counts)), y = 0, label=paste(c(0,1)), adj=c(0.5,1.7), xpd=NA) | |
| 199 legend("topright", fill=rev(colori), legend=rev(names(colori)), bg="white") | |
| 200 } | |
| 201 plotMA(res, main= paste("MA-plot for",title_suffix), ylim=range(res$log2FoldChange, na.rm=TRUE)) | |
| 202 } | |
| 203 | |
| 204 if (verbose) { | |
| 205 cat(paste("primary factor:",primaryFactor,"\n")) | |
| 206 if (length(factors) > 1) { | |
| 207 cat(paste("other factors in design:",paste(factors[-length(factors)],collapse=","),"\n")) | |
| 208 } | |
| 209 cat("\n---------------------\n") | |
| 210 } | |
| 211 | |
| 212 # if JSON input from Galaxy, path is absolute | |
| 213 # otherwise, from sample_table, assume it is relative | |
| 214 dir <- if (is.null(opt$factors)) { | |
| 215 "." | |
| 216 } else { | |
| 217 "" | |
| 218 } | |
| 219 | |
| 220 if (!useTXI) { | |
| 221 # construct the object from HTSeq files | |
| 222 dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, | |
| 223 directory = dir, | |
| 224 design = designFormula) | |
| 225 } else { | |
| 226 # construct the object using tximport | |
| 227 # first need to make the tx2gene table | |
| 228 # this takes ~2-3 minutes using Bioconductor functions | |
| 229 if (!is.null(gtfFile)) { | |
| 230 suppressPackageStartupMessages({ | |
| 231 library("GenomicFeatures") | |
| 232 }) | |
| 233 txdb <- makeTxDbFromGFF(gtfFile, format="gtf") | |
| 234 k <- keys(txdb, keytype = "GENEID") | |
| 235 df <- select(txdb, keys = k, keytype = "GENEID", columns = "TXNAME") | |
| 236 tx2gene <- df[, 2:1] # tx ID, then gene ID | |
| 237 } | |
| 238 library("tximport") | |
| 239 txiFiles <- as.character(sampleTable[,2]) | |
| 240 names(txiFiles) <- as.character(sampleTable[,1]) | |
| 241 txi <- tximport(txiFiles, type="sailfish", tx2gene=tx2gene) | |
| 242 dds <- DESeqDataSetFromTximport(txi, | |
| 243 sampleTable[,3:ncol(sampleTable),drop=FALSE], | |
| 244 designFormula) | |
| 245 } | |
| 246 | |
| 247 if (verbose) cat(paste(ncol(dds), "samples with counts over", nrow(dds), "genes\n")) | |
| 248 # optional outlier behavior | |
| 249 if (is.null(opt$outlier_replace_off)) { | |
| 250 minRep <- 7 | |
| 251 } else { | |
| 252 minRep <- Inf | |
| 253 if (verbose) cat("outlier replacement off\n") | |
| 254 } | |
| 255 if (is.null(opt$outlier_filter_off)) { | |
| 256 cooksCutoff <- TRUE | |
| 257 } else { | |
| 258 cooksCutoff <- FALSE | |
| 259 if (verbose) cat("outlier filtering off\n") | |
| 260 } | |
| 261 | |
| 262 # optional automatic mean filtering | |
| 263 if (is.null(opt$auto_mean_filter_off)) { | |
| 264 independentFiltering <- TRUE | |
| 265 } else { | |
| 266 independentFiltering <- FALSE | |
| 267 if (verbose) cat("automatic filtering on the mean off\n") | |
| 268 } | |
| 269 | |
| 270 # shrinkage of LFCs | |
| 271 if (is.null(opt$beta_prior_off)) { | |
| 272 betaPrior <- TRUE | |
| 273 } else { | |
| 274 betaPrior <- FALSE | |
| 275 if (verbose) cat("beta prior off\n") | |
| 276 } | |
| 277 | |
| 278 # dispersion fit type | |
| 279 if (is.null(opt$fit_type)) { | |
| 280 fitType <- "parametric" | |
| 281 } else { | |
| 282 fitType <- c("parametric","local","mean")[opt$fit_type] | |
| 283 } | |
| 284 | |
| 285 if (verbose) cat(paste("using disperion fit type:",fitType,"\n")) | |
| 286 | |
| 287 # run the analysis | |
| 288 dds <- DESeq(dds, fitType=fitType, betaPrior=betaPrior, minReplicatesForReplace=minRep) | |
| 289 | |
| 290 # create the generic plots and leave the device open | |
| 291 if (!is.null(opt$plots)) { | |
| 292 if (verbose) cat("creating plots\n") | |
| 293 pdf(opt$plots) | |
| 294 generateGenericPlots(dds, factors) | |
| 295 } | |
| 296 | |
| 297 n <- nlevels(colData(dds)[[primaryFactor]]) | |
| 298 allLevels <- levels(colData(dds)[[primaryFactor]]) | |
| 299 | |
| 300 if (!is.null(opt$countsfile)) { | |
| 301 labs <- paste0(seq_len(ncol(dds)), ": ", do.call(paste, as.list(colData(dds)[factors]))) | |
| 302 normalizedCounts<-counts(dds,normalized=TRUE) | |
| 303 colnames(normalizedCounts)<-labs | |
| 304 write.table(normalizedCounts, file=opt$countsfile, sep="\t", col.names=NA, quote=FALSE) | |
| 305 } | |
| 306 | |
| 307 if (is.null(opt$many_contrasts)) { | |
| 308 # only contrast the first and second level of the primary factor | |
| 309 ref <- allLevels[1] | |
| 310 lvl <- allLevels[2] | |
| 311 res <- results(dds, contrast=c(primaryFactor, lvl, ref), | |
| 312 cooksCutoff=cooksCutoff, | |
| 313 independentFiltering=independentFiltering) | |
| 314 if (verbose) { | |
| 315 cat("summary of results\n") | |
| 316 cat(paste0(primaryFactor,": ",lvl," vs ",ref,"\n")) | |
| 317 print(summary(res)) | |
| 318 } | |
| 319 resSorted <- res[order(res$padj),] | |
| 320 outDF <- as.data.frame(resSorted) | |
| 321 outDF$geneID <- rownames(outDF) | |
| 322 outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")] | |
| 323 filename <- opt$outfile | |
| 324 write.table(outDF, file=filename, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE) | |
| 325 if (independentFiltering) { | |
| 326 threshold <- unname(attr(res, "filterThreshold")) | |
| 327 } else { | |
| 328 threshold <- 0 | |
| 329 } | |
| 330 title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref) | |
| 331 if (!is.null(opt$plots)) { | |
| 332 generateSpecificPlots(res, threshold, title_suffix) | |
| 333 } | |
| 334 } else { | |
| 335 # rotate through the possible contrasts of the primary factor | |
| 336 # write out a sorted table of results with the contrast as a suffix | |
| 337 # add contrast specific plots to the device | |
| 338 for (i in seq_len(n-1)) { | |
| 339 ref <- allLevels[i] | |
| 340 contrastLevels <- allLevels[(i+1):n] | |
| 341 for (lvl in contrastLevels) { | |
| 342 res <- results(dds, contrast=c(primaryFactor, lvl, ref), | |
| 343 cooksCutoff=cooksCutoff, | |
| 344 independentFiltering=independentFiltering) | |
| 345 resSorted <- res[order(res$padj),] | |
| 346 outDF <- as.data.frame(resSorted) | |
| 347 outDF$geneID <- rownames(outDF) | |
| 348 outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")] | |
| 349 filename <- paste0(opt$outfile,".",primaryFactor,"_",lvl,"_vs_",ref) | |
| 350 write.table(outDF, file=filename, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE) | |
| 351 if (independentFiltering) { | |
| 352 threshold <- unname(attr(res, "filterThreshold")) | |
| 353 } else { | |
| 354 threshold <- 0 | |
| 355 } | |
| 356 title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref) | |
| 357 if (!is.null(opt$plots)) { | |
| 358 generateSpecificPlots(res, threshold, title_suffix) | |
| 359 } | |
| 360 } | |
| 361 } | |
| 362 } | |
| 363 | |
| 364 # close the plot device | |
| 365 if (!is.null(opt$plots)) { | |
| 366 cat("closing plot device\n") | |
| 367 dev.off() | |
| 368 } | |
| 369 | |
| 370 cat("Session information:\n\n") | |
| 371 | |
| 372 sessionInfo() | |
| 373 |
