comparison goseq.r @ 6:0e9424413ab0 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
author mvdbeek
date Thu, 03 Mar 2016 09:56:51 -0500
parents b79c65c90744
children 9ffae7bc23c2
comparison
equal deleted inserted replaced
5:b79c65c90744 6:0e9424413ab0
1 sink(stdout(), type = "message") 1 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
2 suppressWarnings(suppressMessages(library(goseq))) 2
3 suppressWarnings(suppressMessages(library(optparse))) 3 # we need that to not crash galaxy with an UTF8 error on German LC settings.
4 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
5
6 suppressPackageStartupMessages({
7 library("goseq")
8 library("optparse")
9 })
4 10
5 option_list <- list( 11 option_list <- list(
6 make_option(c("-d", "--dge_file"), type="character", help="Path to file with differential gene expression result"), 12 make_option(c("-d", "--dge_file"), type="character", help="Path to file with differential gene expression result"),
7 make_option(c("-w","--wallenius_tab"), type="character", help="Path to output file with P-values estimated using wallenius distribution."), 13 make_option(c("-w","--wallenius_tab"), type="character", help="Path to output file with P-values estimated using wallenius distribution."),
8 make_option(c("-s","--sampling_tab"), type="character", default=FALSE, help="Path to output file with P-values estimated using wallenius distribution."), 14 make_option(c("-s","--sampling_tab"), type="character", default=FALSE, help="Path to output file with P-values estimated using wallenius distribution."),
12 make_option(c("-padj", "--p_adj_column"), type="integer",help="Column that contains p. adjust values"), 18 make_option(c("-padj", "--p_adj_column"), type="integer",help="Column that contains p. adjust values"),
13 make_option(c("-c", "--cutoff"), type="double",dest="p_adj_cutoff", 19 make_option(c("-c", "--cutoff"), type="double",dest="p_adj_cutoff",
14 help="Genes with p.adjust below cutoff are considered not differentially expressed and serve as control genes"), 20 help="Genes with p.adjust below cutoff are considered not differentially expressed and serve as control genes"),
15 make_option(c("-r", "--repcnt"), type="integer", default=100, help="Number of repeats for sampling"), 21 make_option(c("-r", "--repcnt"), type="integer", default=100, help="Number of repeats for sampling"),
16 make_option(c("-lf", "--length_file"), type="character", default="FALSE", help = "Path to tabular file mapping gene id to length"), 22 make_option(c("-lf", "--length_file"), type="character", default="FALSE", help = "Path to tabular file mapping gene id to length"),
17 make_option(c("-g", "--genome"), type="character", help = "Genome [used for looking up correct gene length]"), 23 make_option(c("-cat_file", "--category_file"), default="FALSE", type="character", help = "Path to tabular file with gene_id <-> category mapping."),
18 make_option(c("-i", "--gene_id"), type="character", help="Gene ID of gene column in DGE file"), 24 make_option(c("-g", "--genome"), default=NULL, type="character", help = "Genome [used for looking up correct gene length]"),
19 make_option(c("-cat", "--use_genes_without_cat"), default=FALSE, type="logical", help="A boolean to indicate whether genes without a categorie should still be used. For example, a large number of gene may have no GO term annotated. If thisoption is set to FALSE, those genes will be ignored in the calculation of p-values(default behaviour). If this option is set to TRUE, then these genes will count towards the total number of genes outside the category being tested (default behaviour prior to version 1.15.2)." 25 make_option(c("-i", "--gene_id"), default=NULL, type="character", help = "Gene ID format of genes in DGE file"),
20 ) 26 make_option(c("-cat", "--use_genes_without_cat"), default=FALSE, type="logical",
21 ) 27 help="A large number of gene may have no GO term annotated. If this option is set to FALSE, genes without category will be ignored in the calculation of p-values(default behaviour). If TRUE these genes will count towards the total number of genes outside the tested category (default behaviour prior to version 1.15.2)."),
28 make_option(c("-plots", "--make_plots"), default=FALSE, type="logical", help="produce diagnostic plots?")
29 )
30
22 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 31 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
23 args = parse_args(parser) 32 args = parse_args(parser)
24 33
25 # Vars: 34 # Vars:
26 dge_file = args$dge_file 35 dge_file = args$dge_file
36 category_file = args$category_file
27 p_adj_column = args$p_adj_colum 37 p_adj_column = args$p_adj_colum
28 p_adj_cutoff = args$p_adj_cutoff 38 p_adj_cutoff = args$p_adj_cutoff
29 length_file = args$length_file 39 length_file = args$length_file
30 genome = args$genome 40 genome = args$genome
31 gene_id = args$gene_id 41 gene_id = args$gene_id
34 nobias_tab = args$nobias_tab 44 nobias_tab = args$nobias_tab
35 length_bias_plot = args$length_bias_plot 45 length_bias_plot = args$length_bias_plot
36 sample_vs_wallenius_plot = args$sample_vs_wallenius_plot 46 sample_vs_wallenius_plot = args$sample_vs_wallenius_plot
37 repcnt = args$repcnt 47 repcnt = args$repcnt
38 use_genes_without_cat = args$use_genes_without_cat 48 use_genes_without_cat = args$use_genes_without_cat
49 make_plots = args$make_plots
39 50
40 # format DE genes into vector suitable for use with goseq 51 # format DE genes into named vector suitable for goseq
41 dge_table = read.delim(dge_file, header = TRUE, sep="\t", check.names = FALSE) 52 first_line = read.delim(dge_file, header = FALSE, nrow=1)
53 # check if header [character where numeric is expected]
54 if (is.numeric(first_line[,p_adj_column])) {
55 dge_table = read.delim(dge_file, header = FALSE, sep="\t")
56 } else {
57 dge_table = read.delim(dge_file, header = TRUE, sep="\t")
58 }
59
42 genes = as.integer(dge_table[,p_adj_column]<p_adj_cutoff) 60 genes = as.integer(dge_table[,p_adj_column]<p_adj_cutoff)
43 names(genes) = dge_table[,1] # Assuming first row contains gene names 61 names(genes) = dge_table[,1] # Assuming first row contains gene names
44 62
45 # Get gene lengths 63 # gene lengths, assuming last column
46 if (length_file != "FALSE" ) { 64 if (length_file != "FALSE" ) {
47 length_table = read.delim(length_file, header=TRUE, sep="\t", check.names=FALSE) 65 first_line = read.delim(dge_file, header = FALSE, nrow=1)
66 if (is.numeric(first_line[, ncol(first_line)])) {
67 length_table = read.delim(length_file, header=FALSE, sep="\t", check.names=FALSE)
68 } else {
69 length_table = read.delim(length_file, header=TRUE, sep="\t", check.names=FALSE)
70 }
48 row.names(length_table) = length_table[,1] 71 row.names(length_table) = length_table[,1]
49 gene_lengths = length_table[names(genes),]$length 72 gene_lengths = length_table[names(genes),][,ncol(length_table)]
50 } else { 73 } else {
51 gene_lengths = getlength(names(genes), genome, gene_id) 74 gene_lengths = getlength(names(genes), genome, gene_id)
52 } 75 }
53 76
54 # Estimate PWF 77 # Estimate PWF
55 78
56 pdf(length_bias_plot) 79 if (make_plots == TRUE) {
57 pwf=nullp(genes, genome, gene_id, gene_lengths) 80 pdf(length_bias_plot)
58 message = dev.off() 81 }
82 pwf=nullp(genes, genome = genome, id = gene_id, bias.data = gene_lengths, plot.fit=make_plots)
83 graphics.off()
59 84
60 # Fetch GO annotations: 85 # Fetch GO annotations if category_file hasn't been supplied:
61 go_map=getgo(names(genes), genome, gene_id, fetch.cats=c("GO:CC", "GO:BP", "GO:MF", "KEGG")) 86 if (category_file == "FALSE") {
87 go_map=getgo(genes = names(genes), genome = genome, id = gene_id, fetch.cats=c("GO:CC", "GO:BP", "GO:MF", "KEGG"))
88 } else {
89 # check for header: first entry in first column must be present in genes, else it's a header
90 first_line = read.delim(category_file, header = FALSE, nrow=1)
91 if (first_line[,1] %in% names(genes)) {
92 go_map = read.delim(category_file, header = FALSE)
93 } else {
94 go_map = read.delim(category_file, header= TRUE)
95 }
96 }
62 97
63 # wallenius approximation of p-values 98 # wallenius approximation of p-values
64 GO.wall=goseq(pwf, genome, gene_id, use_genes_without_cat = use_genes_without_cat, gene2cat=go_map) 99 GO.wall=goseq(pwf, genome = genome, id = gene_id, use_genes_without_cat = use_genes_without_cat, gene2cat=go_map)
65 100
66 GO.nobias=goseq(pwf, genome, gene_id, method="Hypergeometric", use_genes_without_cat = use_genes_without_cat, gene2cat=go_map) 101 GO.nobias=goseq(pwf, genome = genome, id = gene_id, method="Hypergeometric", use_genes_without_cat = use_genes_without_cat, gene2cat=go_map)
67 102
68 # Sampling distribution 103 # Sampling distribution
69 if (repcnt > 0) { 104 if (repcnt > 0) {
70 GO.samp=goseq(pwf,genome, gene_id, method="Sampling", repcnt=repcnt, use_genes_without_cat = use_genes_without_cat, gene2cat=go_map) 105 GO.samp=goseq(pwf, genome = genome, id = gene_id, method="Sampling", repcnt=repcnt, use_genes_without_cat = use_genes_without_cat, gene2cat=go_map)
71 # Compare sampling with wallenius 106 # Compare sampling with wallenius
107 if (make_plots == TRUE) {
72 pdf(sample_vs_wallenius_plot) 108 pdf(sample_vs_wallenius_plot)
73 plot(log10(GO.wall[,2]), log10(GO.samp[match(GO.samp[,1],GO.wall[,1]),2]), 109 plot(log10(GO.wall[,2]), log10(GO.samp[match(GO.samp[,1],GO.wall[,1]),2]),
74 xlab="log10(Wallenius p-values)",ylab="log10(Sampling p-values)", 110 xlab="log10(Wallenius p-values)",ylab="log10(Sampling p-values)",
75 xlim=c(-3,0)) 111 xlim=c(-3,0))
76 abline(0,1,col=3,lty=2) 112 abline(0,1,col=3,lty=2)
77 message = dev.off() 113 graphics.off()
114 }
78 write.table(GO.samp, sampling_tab, sep="\t", row.names = FALSE, quote = FALSE) 115 write.table(GO.samp, sampling_tab, sep="\t", row.names = FALSE, quote = FALSE)
79 } 116 }
80 117
81 118
82 write.table(GO.wall, wallenius_tab, sep="\t", row.names = FALSE, quote = FALSE) 119 write.table(GO.wall, wallenius_tab, sep="\t", row.names = FALSE, quote = FALSE)