Mercurial > repos > mvdbeek > r_goseq_1_22_0
comparison goseq.r @ 6:0e9424413ab0 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
author | mvdbeek |
---|---|
date | Thu, 03 Mar 2016 09:56:51 -0500 |
parents | b79c65c90744 |
children | 9ffae7bc23c2 |
comparison
equal
deleted
inserted
replaced
5:b79c65c90744 | 6:0e9424413ab0 |
---|---|
1 sink(stdout(), type = "message") | 1 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) |
2 suppressWarnings(suppressMessages(library(goseq))) | 2 |
3 suppressWarnings(suppressMessages(library(optparse))) | 3 # we need that to not crash galaxy with an UTF8 error on German LC settings. |
4 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | |
5 | |
6 suppressPackageStartupMessages({ | |
7 library("goseq") | |
8 library("optparse") | |
9 }) | |
4 | 10 |
5 option_list <- list( | 11 option_list <- list( |
6 make_option(c("-d", "--dge_file"), type="character", help="Path to file with differential gene expression result"), | 12 make_option(c("-d", "--dge_file"), type="character", help="Path to file with differential gene expression result"), |
7 make_option(c("-w","--wallenius_tab"), type="character", help="Path to output file with P-values estimated using wallenius distribution."), | 13 make_option(c("-w","--wallenius_tab"), type="character", help="Path to output file with P-values estimated using wallenius distribution."), |
8 make_option(c("-s","--sampling_tab"), type="character", default=FALSE, help="Path to output file with P-values estimated using wallenius distribution."), | 14 make_option(c("-s","--sampling_tab"), type="character", default=FALSE, help="Path to output file with P-values estimated using wallenius distribution."), |
12 make_option(c("-padj", "--p_adj_column"), type="integer",help="Column that contains p. adjust values"), | 18 make_option(c("-padj", "--p_adj_column"), type="integer",help="Column that contains p. adjust values"), |
13 make_option(c("-c", "--cutoff"), type="double",dest="p_adj_cutoff", | 19 make_option(c("-c", "--cutoff"), type="double",dest="p_adj_cutoff", |
14 help="Genes with p.adjust below cutoff are considered not differentially expressed and serve as control genes"), | 20 help="Genes with p.adjust below cutoff are considered not differentially expressed and serve as control genes"), |
15 make_option(c("-r", "--repcnt"), type="integer", default=100, help="Number of repeats for sampling"), | 21 make_option(c("-r", "--repcnt"), type="integer", default=100, help="Number of repeats for sampling"), |
16 make_option(c("-lf", "--length_file"), type="character", default="FALSE", help = "Path to tabular file mapping gene id to length"), | 22 make_option(c("-lf", "--length_file"), type="character", default="FALSE", help = "Path to tabular file mapping gene id to length"), |
17 make_option(c("-g", "--genome"), type="character", help = "Genome [used for looking up correct gene length]"), | 23 make_option(c("-cat_file", "--category_file"), default="FALSE", type="character", help = "Path to tabular file with gene_id <-> category mapping."), |
18 make_option(c("-i", "--gene_id"), type="character", help="Gene ID of gene column in DGE file"), | 24 make_option(c("-g", "--genome"), default=NULL, type="character", help = "Genome [used for looking up correct gene length]"), |
19 make_option(c("-cat", "--use_genes_without_cat"), default=FALSE, type="logical", help="A boolean to indicate whether genes without a categorie should still be used. For example, a large number of gene may have no GO term annotated. If thisoption is set to FALSE, those genes will be ignored in the calculation of p-values(default behaviour). If this option is set to TRUE, then these genes will count towards the total number of genes outside the category being tested (default behaviour prior to version 1.15.2)." | 25 make_option(c("-i", "--gene_id"), default=NULL, type="character", help = "Gene ID format of genes in DGE file"), |
20 ) | 26 make_option(c("-cat", "--use_genes_without_cat"), default=FALSE, type="logical", |
21 ) | 27 help="A large number of gene may have no GO term annotated. If this option is set to FALSE, genes without category will be ignored in the calculation of p-values(default behaviour). If TRUE these genes will count towards the total number of genes outside the tested category (default behaviour prior to version 1.15.2)."), |
28 make_option(c("-plots", "--make_plots"), default=FALSE, type="logical", help="produce diagnostic plots?") | |
29 ) | |
30 | |
22 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) | 31 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) |
23 args = parse_args(parser) | 32 args = parse_args(parser) |
24 | 33 |
25 # Vars: | 34 # Vars: |
26 dge_file = args$dge_file | 35 dge_file = args$dge_file |
36 category_file = args$category_file | |
27 p_adj_column = args$p_adj_colum | 37 p_adj_column = args$p_adj_colum |
28 p_adj_cutoff = args$p_adj_cutoff | 38 p_adj_cutoff = args$p_adj_cutoff |
29 length_file = args$length_file | 39 length_file = args$length_file |
30 genome = args$genome | 40 genome = args$genome |
31 gene_id = args$gene_id | 41 gene_id = args$gene_id |
34 nobias_tab = args$nobias_tab | 44 nobias_tab = args$nobias_tab |
35 length_bias_plot = args$length_bias_plot | 45 length_bias_plot = args$length_bias_plot |
36 sample_vs_wallenius_plot = args$sample_vs_wallenius_plot | 46 sample_vs_wallenius_plot = args$sample_vs_wallenius_plot |
37 repcnt = args$repcnt | 47 repcnt = args$repcnt |
38 use_genes_without_cat = args$use_genes_without_cat | 48 use_genes_without_cat = args$use_genes_without_cat |
49 make_plots = args$make_plots | |
39 | 50 |
40 # format DE genes into vector suitable for use with goseq | 51 # format DE genes into named vector suitable for goseq |
41 dge_table = read.delim(dge_file, header = TRUE, sep="\t", check.names = FALSE) | 52 first_line = read.delim(dge_file, header = FALSE, nrow=1) |
53 # check if header [character where numeric is expected] | |
54 if (is.numeric(first_line[,p_adj_column])) { | |
55 dge_table = read.delim(dge_file, header = FALSE, sep="\t") | |
56 } else { | |
57 dge_table = read.delim(dge_file, header = TRUE, sep="\t") | |
58 } | |
59 | |
42 genes = as.integer(dge_table[,p_adj_column]<p_adj_cutoff) | 60 genes = as.integer(dge_table[,p_adj_column]<p_adj_cutoff) |
43 names(genes) = dge_table[,1] # Assuming first row contains gene names | 61 names(genes) = dge_table[,1] # Assuming first row contains gene names |
44 | 62 |
45 # Get gene lengths | 63 # gene lengths, assuming last column |
46 if (length_file != "FALSE" ) { | 64 if (length_file != "FALSE" ) { |
47 length_table = read.delim(length_file, header=TRUE, sep="\t", check.names=FALSE) | 65 first_line = read.delim(dge_file, header = FALSE, nrow=1) |
66 if (is.numeric(first_line[, ncol(first_line)])) { | |
67 length_table = read.delim(length_file, header=FALSE, sep="\t", check.names=FALSE) | |
68 } else { | |
69 length_table = read.delim(length_file, header=TRUE, sep="\t", check.names=FALSE) | |
70 } | |
48 row.names(length_table) = length_table[,1] | 71 row.names(length_table) = length_table[,1] |
49 gene_lengths = length_table[names(genes),]$length | 72 gene_lengths = length_table[names(genes),][,ncol(length_table)] |
50 } else { | 73 } else { |
51 gene_lengths = getlength(names(genes), genome, gene_id) | 74 gene_lengths = getlength(names(genes), genome, gene_id) |
52 } | 75 } |
53 | 76 |
54 # Estimate PWF | 77 # Estimate PWF |
55 | 78 |
56 pdf(length_bias_plot) | 79 if (make_plots == TRUE) { |
57 pwf=nullp(genes, genome, gene_id, gene_lengths) | 80 pdf(length_bias_plot) |
58 message = dev.off() | 81 } |
82 pwf=nullp(genes, genome = genome, id = gene_id, bias.data = gene_lengths, plot.fit=make_plots) | |
83 graphics.off() | |
59 | 84 |
60 # Fetch GO annotations: | 85 # Fetch GO annotations if category_file hasn't been supplied: |
61 go_map=getgo(names(genes), genome, gene_id, fetch.cats=c("GO:CC", "GO:BP", "GO:MF", "KEGG")) | 86 if (category_file == "FALSE") { |
87 go_map=getgo(genes = names(genes), genome = genome, id = gene_id, fetch.cats=c("GO:CC", "GO:BP", "GO:MF", "KEGG")) | |
88 } else { | |
89 # check for header: first entry in first column must be present in genes, else it's a header | |
90 first_line = read.delim(category_file, header = FALSE, nrow=1) | |
91 if (first_line[,1] %in% names(genes)) { | |
92 go_map = read.delim(category_file, header = FALSE) | |
93 } else { | |
94 go_map = read.delim(category_file, header= TRUE) | |
95 } | |
96 } | |
62 | 97 |
63 # wallenius approximation of p-values | 98 # wallenius approximation of p-values |
64 GO.wall=goseq(pwf, genome, gene_id, use_genes_without_cat = use_genes_without_cat, gene2cat=go_map) | 99 GO.wall=goseq(pwf, genome = genome, id = gene_id, use_genes_without_cat = use_genes_without_cat, gene2cat=go_map) |
65 | 100 |
66 GO.nobias=goseq(pwf, genome, gene_id, method="Hypergeometric", use_genes_without_cat = use_genes_without_cat, gene2cat=go_map) | 101 GO.nobias=goseq(pwf, genome = genome, id = gene_id, method="Hypergeometric", use_genes_without_cat = use_genes_without_cat, gene2cat=go_map) |
67 | 102 |
68 # Sampling distribution | 103 # Sampling distribution |
69 if (repcnt > 0) { | 104 if (repcnt > 0) { |
70 GO.samp=goseq(pwf,genome, gene_id, method="Sampling", repcnt=repcnt, use_genes_without_cat = use_genes_without_cat, gene2cat=go_map) | 105 GO.samp=goseq(pwf, genome = genome, id = gene_id, method="Sampling", repcnt=repcnt, use_genes_without_cat = use_genes_without_cat, gene2cat=go_map) |
71 # Compare sampling with wallenius | 106 # Compare sampling with wallenius |
107 if (make_plots == TRUE) { | |
72 pdf(sample_vs_wallenius_plot) | 108 pdf(sample_vs_wallenius_plot) |
73 plot(log10(GO.wall[,2]), log10(GO.samp[match(GO.samp[,1],GO.wall[,1]),2]), | 109 plot(log10(GO.wall[,2]), log10(GO.samp[match(GO.samp[,1],GO.wall[,1]),2]), |
74 xlab="log10(Wallenius p-values)",ylab="log10(Sampling p-values)", | 110 xlab="log10(Wallenius p-values)",ylab="log10(Sampling p-values)", |
75 xlim=c(-3,0)) | 111 xlim=c(-3,0)) |
76 abline(0,1,col=3,lty=2) | 112 abline(0,1,col=3,lty=2) |
77 message = dev.off() | 113 graphics.off() |
114 } | |
78 write.table(GO.samp, sampling_tab, sep="\t", row.names = FALSE, quote = FALSE) | 115 write.table(GO.samp, sampling_tab, sep="\t", row.names = FALSE, quote = FALSE) |
79 } | 116 } |
80 | 117 |
81 | 118 |
82 write.table(GO.wall, wallenius_tab, sep="\t", row.names = FALSE, quote = FALSE) | 119 write.table(GO.wall, wallenius_tab, sep="\t", row.names = FALSE, quote = FALSE) |