comparison goseq.r @ 5:b79c65c90744 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit 81aedf1b50849160f6c048c0da4bb1038bb813a5
author mvdbeek
date Sun, 28 Feb 2016 11:52:10 -0500
parents 76eab486aba9
children 0e9424413ab0
comparison
equal deleted inserted replaced
4:76eab486aba9 5:b79c65c90744
13 make_option(c("-c", "--cutoff"), type="double",dest="p_adj_cutoff", 13 make_option(c("-c", "--cutoff"), type="double",dest="p_adj_cutoff",
14 help="Genes with p.adjust below cutoff are considered not differentially expressed and serve as control genes"), 14 help="Genes with p.adjust below cutoff are considered not differentially expressed and serve as control genes"),
15 make_option(c("-r", "--repcnt"), type="integer", default=100, help="Number of repeats for sampling"), 15 make_option(c("-r", "--repcnt"), type="integer", default=100, help="Number of repeats for sampling"),
16 make_option(c("-lf", "--length_file"), type="character", default="FALSE", help = "Path to tabular file mapping gene id to length"), 16 make_option(c("-lf", "--length_file"), type="character", default="FALSE", help = "Path to tabular file mapping gene id to length"),
17 make_option(c("-g", "--genome"), type="character", help = "Genome [used for looking up correct gene length]"), 17 make_option(c("-g", "--genome"), type="character", help = "Genome [used for looking up correct gene length]"),
18 make_option(c("-i", "--gene_id"), type="character", help="Gene ID of gene column in DGE file") 18 make_option(c("-i", "--gene_id"), type="character", help="Gene ID of gene column in DGE file"),
19 make_option(c("-cat", "--use_genes_without_cat"), default=FALSE, type="logical", help="A boolean to indicate whether genes without a categorie should still be used. For example, a large number of gene may have no GO term annotated. If thisoption is set to FALSE, those genes will be ignored in the calculation of p-values(default behaviour). If this option is set to TRUE, then these genes will count towards the total number of genes outside the category being tested (default behaviour prior to version 1.15.2)."
20 )
19 ) 21 )
20 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 22 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
21 args = parse_args(parser) 23 args = parse_args(parser)
22 24
23 # Vars: 25 # Vars:
31 sampling_tab = args$sampling_tab 33 sampling_tab = args$sampling_tab
32 nobias_tab = args$nobias_tab 34 nobias_tab = args$nobias_tab
33 length_bias_plot = args$length_bias_plot 35 length_bias_plot = args$length_bias_plot
34 sample_vs_wallenius_plot = args$sample_vs_wallenius_plot 36 sample_vs_wallenius_plot = args$sample_vs_wallenius_plot
35 repcnt = args$repcnt 37 repcnt = args$repcnt
38 use_genes_without_cat = args$use_genes_without_cat
36 39
37 # format DE genes into vector suitable for use with goseq 40 # format DE genes into vector suitable for use with goseq
38 dge_table = read.delim(dge_file, header = TRUE, sep="\t", check.names = FALSE) 41 dge_table = read.delim(dge_file, header = TRUE, sep="\t", check.names = FALSE)
39 genes = as.integer(dge_table[,p_adj_column]<p_adj_cutoff) 42 genes = as.integer(dge_table[,p_adj_column]<p_adj_cutoff)
40 names(genes) = dge_table[,1] # Assuming first row contains gene names 43 names(genes) = dge_table[,1] # Assuming first row contains gene names
51 # Estimate PWF 54 # Estimate PWF
52 55
53 pdf(length_bias_plot) 56 pdf(length_bias_plot)
54 pwf=nullp(genes, genome, gene_id, gene_lengths) 57 pwf=nullp(genes, genome, gene_id, gene_lengths)
55 message = dev.off() 58 message = dev.off()
59
60 # Fetch GO annotations:
61 go_map=getgo(names(genes), genome, gene_id, fetch.cats=c("GO:CC", "GO:BP", "GO:MF", "KEGG"))
62
56 # wallenius approximation of p-values 63 # wallenius approximation of p-values
57 GO.wall=goseq(pwf, genome, gene_id) 64 GO.wall=goseq(pwf, genome, gene_id, use_genes_without_cat = use_genes_without_cat, gene2cat=go_map)
58 65
59 GO.nobias=goseq(pwf, genome, gene_id, method="Hypergeometric") 66 GO.nobias=goseq(pwf, genome, gene_id, method="Hypergeometric", use_genes_without_cat = use_genes_without_cat, gene2cat=go_map)
60 67
61 # Sampling distribution 68 # Sampling distribution
62 if (repcnt > 0) { 69 if (repcnt > 0) {
63 GO.samp=goseq(pwf,genome, gene_id, method="Sampling", repcnt=repcnt) 70 GO.samp=goseq(pwf,genome, gene_id, method="Sampling", repcnt=repcnt, use_genes_without_cat = use_genes_without_cat, gene2cat=go_map)
64 # Compare sampling with wallenius 71 # Compare sampling with wallenius
65 pdf(sample_vs_wallenius_plot) 72 pdf(sample_vs_wallenius_plot)
66 plot(log10(GO.wall[,2]), log10(GO.samp[match(GO.samp[,1],GO.wall[,1]),2]), 73 plot(log10(GO.wall[,2]), log10(GO.samp[match(GO.samp[,1],GO.wall[,1]),2]),
67 xlab="log10(Wallenius p-values)",ylab="log10(Sampling p-values)", 74 xlab="log10(Wallenius p-values)",ylab="log10(Sampling p-values)",
68 xlim=c(-3,0)) 75 xlim=c(-3,0))