# HG changeset patch # User proteore # Date 1550584045 18000 # Node ID 0014bd289aff8bd50dd685cae4806b086cdbdba3 # Parent 36c97ab06d407ecc7dd39504a0d0d7ee3b3e71fa planemo upload commit 4cbd26b257f8a32e11289e28135da86b03b46622-dirty diff -r 36c97ab06d40 -r 0014bd289aff README.rst --- a/README.rst Mon Dec 17 11:05:04 2018 -0500 +++ b/README.rst Tue Feb 19 08:47:25 2019 -0500 @@ -7,7 +7,7 @@ **Galaxy integration** -Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR +Lisa Perus, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform diff -r 36c97ab06d40 -r 0014bd289aff topGO.xml --- a/topGO.xml Mon Dec 17 11:05:04 2018 -0500 +++ b/topGO.xml Tue Feb 19 08:47:25 2019 -0500 @@ -1,4 +1,4 @@ - + (Human, Mouse, Rat)[topGO] R @@ -69,7 +69,9 @@ - + + [c]{0,1}[0-9]+ + @@ -96,7 +98,9 @@ - + + [c]{0,1}[0-9]+ + @@ -117,7 +121,7 @@ - + @@ -141,16 +145,16 @@ - - textoutput + + + + + + 'dotplot' in plot - barplot - - - - dotplot + 'barplot' in plot @@ -185,61 +189,55 @@ **Description** + This tool is based on R package topGO. topGO package provides tools for testing GO terms while accounting for the topology of the GO graph. Different test statistics and different methods for eliminating local similarities and dependencies between GO terms can be applied. + +This component computes the GO terms representativity of a gene list in one ontology category (Biological Process "BP", Cellular Component "CC", Molecular Function "MF"). This representativity is evaluated in comparison to the background list of all genes/proteins (of the selected species) associated with GO terms of the chosen category (BP,CC,MF). + +----- + **Input required** -This component works with Ensembl gene ids (e.g : ENSG0000013618). 
You can -copy/paste these identifiers or supply a tabular file (.csv, .tsv, .txt, .tab) -where there are contained. +This component works with Ensembl gene IDs (e.g : ENSG0000013618). You can copy/paste these identifiers or supply a tabular file (.csv, .tsv, .txt, .tab) +and then specify the column number that contains the ENSG IDs. + +----- + +**Parameters** -**Principle** +"Species": the three available species are Homo sapiens, Mus musculus and Rattus norvegicus + +"GO terms category": select either Biological Process (BP) (by default), Cellular Component (CC) or Molecular Function (MF) + +"Select the topGO parameter (see user doc)": topGO provides a classic Fisher test for evaluating which GO terms are over-represented in your gene/protein list; other methodologies are also provided (Elim, Weight01, Parentchild). For the merits of each option and their algorithmic descriptions, please refer to topGO manual: +https://bioconductor.org/packages/release/bioc/vignettes/topGO/inst/doc/topGO.pdf -This component provides the GO terms representativity of a gene list in one ontology category (Biological Process "BP", Cellular Component "CC", Molecular Function "MF"). This representativity is evaluated in comparison to the background list of all human genes associated associated with GO terms of the chosen category (BP,CC,MF). This background is given by the R package "org.Hs.eg.db", which is a genome wide association package for **human**. +"p-value threshold (e.g : 1e-3)": must be in the form of "1e-5" (i.e. 0.00001) + +"Multiple testing procedure (p-value adjustment)": several FDR procedures for multiple testing and p-value adjustment are available: Holm, Hochberg, +Hommel, Bonferroni, BH (Benjamini-Hochberg), BY (Benjamini-Yekutieli), FDR. Default is BH (most commonly used) + +----- **Output** -Three kind of outputs are available : a textual output, a barplot output and -a dotplot output. 
+Three outputs are available: a textual output and, as graphical outputs, a barplot and/or a dotplot (set by default). -*Textual output* : -The text output lists all the GO-terms that were found significant under the specified threshold. - - -The different fields are as follow : +*Textual output* -- Annotated : number of genes in org.Hs.eg.db which are annotated with the GO-term. - -- Significant : number of genes belonging to your input which are annotated with the GO-term. +The text output lists all the GO-terms that were found significantly enriched according to the specified threshold (p-value). -- Expected : show an estimate of the number of genes a node of size Annotated would have if the significant genes were to be randomly selected from the gene universe. - -- pvalues : pvalue obtained after the test - -- ( qvalues : additional column with adjusted pvalues ) +The different fields are as follows: - -**Tests** - -topGO provides a classic fisher test for evaluating if some GO terms are over-represented in your gene list, but other options are also provided (elim, weight01,parentchild). For the merits of each option and their algorithmic descriptions, please refer to topGO manual : -https://bioconductor.org/packages/release/bioc/vignettes/topGO/inst/doc/topGO.pdf +- Annotated : number of genes in the selected species that are annotated with the GO-term. -**Multiple testing corrections** - -Furthermore, the following corrections for multiple testing can also be applied : - -- holm +- Significant : number of genes belonging to your input annotated with the GO-term. -- hochberg - -- hommel - -- bonferroni +- Expected : represents the expected number of interesting genes mapped to the GO term if the interesting genes were randomly distributed over all GO terms. -- BH +- p-values : p-value obtained after the test -- BY - -- fdr +- ( q-values : additional column with adjusted p-values ) ----- @@ -247,13 +245,17 @@ **Authors** -Alexa A and Rahnenfuhrer J (2016). 
topGO: Enrichment Analysis for Gene Ontology. R package version 2.30.0. +Alexa A, Rahnenführer J, Lengauer T. Improved scoring of functional groups from gene expression data by decorrelating GO graph structure. Bioinformatics. 2006. 22(13):1600-7. PubMed PMID: 16606683. + +----- + +.. class:: infomark **Galaxy integration** -Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR +Lisa Perus, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck - CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR -Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform +Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux - INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform, FR This work has been partially funded through the French National Agency for Research (ANR) IFB project. diff -r 36c97ab06d40 -r 0014bd289aff topGO_enrichment.R --- a/topGO_enrichment.R Mon Dec 17 11:05:04 2018 -0500 +++ b/topGO_enrichment.R Tue Feb 19 08:47:25 2019 -0500 @@ -155,91 +155,47 @@ createDotPlot = function(data, onto){ - values = deleteInfChar(data$pvalues) - values = roundValues(values) - values = as.numeric(values) - - geneRatio = data$Significant/data$Annotated - goTerms = data$Term - count = data$Significant - - labely = paste("GO terms",onto,sep=" ") - ggplot(data,aes(x=geneRatio,y=goTerms, color=values,size=count)) +geom_point( ) + scale_colour_gradientn(colours=c("red","violet","blue")) + xlab("Gene Ratio") + ylab(labely) + labs(color="p-values\n" ) - ggsave("dotplot.png", device = "png", dpi = 320, limitsize = TRUE, width = 15, height = 15, units="cm") + values = deleteInfChar(data$pvalues) + values = roundValues(values) + values = as.numeric(values) + + geneRatio = data$Significant/data$Annotated + goTerms = data$Term + count = data$Significant + + labely = paste("GO terms",onto,sep=" ") + 
ggplot(data,aes(x=geneRatio,y=goTerms, color=values,size=count)) +geom_point( ) + scale_colour_gradientn(colours=c("red","violet","blue")) + xlab("Gene Ratio") + ylab(labely) + labs(color="p-values\n" ) + ggsave("dotplot.png", device = "png", dpi = 320, limitsize = TRUE, width = 15, height = 15, units="cm") } createBarPlot = function(data, onto){ - - values = deleteInfChar(data$pvalues) - values = roundValues(values) - values = as.numeric(values) - - goTerms = data$Term - count = data$Significant - - labely = paste("GO terms",onto,sep=" ") - ggplot(data, aes(x=goTerms, y=count,fill=values,scale(scale = 0.5))) + ylab("Gene count") + xlab(labely) +geom_bar(stat="identity") + scale_fill_gradientn(colours=c("red","violet","blue")) + coord_flip() + labs(fill="p-values\n") - ggsave("barplot.png", device = "png", dpi = 320, limitsize = TRUE, width = 15, height = 15, units="cm") + values = deleteInfChar(data$pvalues) + values = roundValues(values) + values = as.numeric(values) + + goTerms = data$Term + count = data$Significant + + labely = paste("GO terms",onto,sep=" ") + ggplot(data, aes(x=goTerms, y=count,fill=values,scale(scale = 0.5))) + ylab("Gene count") + xlab(labely) +geom_bar(stat="identity") + scale_fill_gradientn(colours=c("red","violet","blue")) + coord_flip() + labs(fill="p-values\n") + ggsave("barplot.png", device = "png", dpi = 320, limitsize = TRUE, width = 15, height = 15, units="cm") } # Produce the different outputs createOutputs = function(result, cut_result,text, barplot, dotplot, onto){ - + if (is.null(result)){ - if (text){ - err_msg = "None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably either have no associated GO terms or are not ENSG identifiers (e.g : ENSG00000012048)." 
- write.table(err_msg, file='result', quote=FALSE, sep='\t', col.names = T, row.names = F) - } - if (barplot){ - png(filename="barplot.png") - plot.new() - #text(0,0,err_msg) - dev.off() - } - if (dotplot){ - png(filename="dotplot.png") - plot.new() - #text(0,0,err_msg) - dev.off() - } - opt <- options(show.error.messages=FALSE) - on.exit(options(opt)) - stop("null result") - } + err_msg = "None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably either have no associated GO terms or are not ENSG identifiers (e.g : ENSG00000012048)." + write.table(err_msg, file='result', quote=FALSE, sep='\t', col.names = F, row.names = F) + }else if (is.null(cut_result)){ + err_msg = "Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value." + write.table(err_msg, file='result.tsv', quote=FALSE, sep='\t', col.names = F, row.names = F) + }else { + write.table(cut_result, file='result.tsv', quote=FALSE, sep='\t', col.names = T, row.names = F) - if (is.null(cut_result)){ - if (text){ - err_msg = "Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value." - write.table(err_msg, file='result', quote=FALSE, sep='\t', col.names = T, row.names = F) - } - if (barplot){ - png(filename="barplot.png") - plot.new() - text(0,0,err_msg) - dev.off() - } - if (dotplot){ - png(filename="dotplot.png") - plot.new() - text(0,0,err_msg) - dev.off() - } - opt <- options(show.error.messages=FALSE) - on.exit(options(opt)) - stop("null cut_result") - } - - if (text){ - write.table(cut_result, file='result', quote=FALSE, sep='\t', col.names = T, row.names = F) - } - - if (barplot){ - createBarPlot(cut_result, onto) - } - - if (dotplot){ - createDotPlot(cut_result, onto) + if (barplot){createBarPlot(cut_result, onto)} + if (dotplot){createDotPlot(cut_result, onto)} } } @@ -318,8 +274,7 @@ #check of ENS ids if (! 
any(check_ens_ids(sample))){ - print("no ensembl gene ids found in your ids list, please check your IDs in input or the selected column of your input file") - stop() + stop("no ensembl gene ids found in your ids list, please check your IDs in input or the selected column of your input file") } #get input if background genes @@ -332,8 +287,7 @@ } #check of ENS ids if (! any(check_ens_ids(background_sample))){ - print("no ensembl gene ids found in your background ids list, please check your IDs in input or the selected column of your input file") - stop() + stop("no ensembl gene ids found in your background ids list, please check your IDs in input or the selected column of your input file") } } else { background_sample=NULL