Mercurial > repos > proteore > proteore_topgo

--- a/topGO.xml	Mon Oct 15 11:30:04 2018 -0400
+++ b/topGO.xml	Tue Oct 16 10:28:50 2018 -0400
@@ -1,4 +1,4 @@
-<tool id="topGO" name="Enrichment analysis for Gene Ontology" version="2018.10.12">
+<tool id="topGO" name="Enrichment analysis for Gene Ontology" version="2018.10.16">
     <description>(Human, Mouse, Rat) (topGO)</description>
     <requirements>
         <requirement type="package" version="3.4.1">R</requirement>
@@ -36,6 +36,16 @@
   --barplotoutput='$barplot'
   --dotplotoutput='$dotplot'
   --geneuniverse='$geneuniverse'
+  --background='$background_genes.background'
+  ## background options are single-quoted like the options above: the text sanitizer only neutralises single quotes
+  #if $background_genes.background == "true"
+    --background_genes='$background_genes.inputtype.genelist'
+    --background_input_type='$background_genes.inputtype.filetype'
+    #if $background_genes.inputtype.filetype == "file"
+      --background_header='$background_genes.inputtype.header'
+      --background_column='$background_genes.inputtype.column'
+    #end if
+  #end if

     ]]></command>

@@ -63,6 +73,35 @@
       <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does your file have a header?" />
     </when>
   </conditional>
+  <conditional name="background_genes"> <!-- optional user-defined background IDs; the extra parameters are only shown when the checkbox is ticked -->
+    <param name="background" type="boolean" checked="false" truevalue="true" falsevalue="false" label="Would you like to define your own background IDs"/>
+    <when value="true">
+      <conditional name="inputtype"> <!-- how the background IDs are supplied: uploaded file or pasted text -->
+        <param name="filetype" type="select" label="Select your type of input file" help="The identifiers must be Ensembl gene IDs (e.g : ENSG00000139618). If it is not the case, please use the ID Mapping tool.">
+          <option value="file" selected="true">Input file containing your identifiers</option>
+          <option value="copy_paste">Copy/paste your list of IDs</option>
+        </param>
+        <when value="copy_paste">
+          <param name="genelist" type="text" label="Enter a list of identifiers">
+            <sanitizer> <!-- printable characters are accepted, except the single quote which is mapped to __sq__ -->
+            <valid initial="string.printable">
+                <remove value="&apos;"/>
+            </valid>
+            <mapping initial="none">
+                <add source="&apos;" target="__sq__"/>
+            </mapping>
+            </sanitizer>
+          </param>
+        </when>
+        <when value="file"> <!-- file input: dataset, column holding the IDs (c1, c2, ...) and header flag -->
+          <param name="genelist" type="data" format="txt,tabular" label="Choose an input file" help="This file must imperatively have 1 column filled with IDs consistent with the database that will be used. Please use the MappingIDs component if this is not the case."/>
+          <param name="column" type="text" label="Please specify the column where your Ensembl IDs are (e.g : Enter 'c1' for column n°1..)" value="c1"/>
+          <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does your file have a header?" />
+        </when>
+      </conditional>
+    </when>
+    <when value="false"/> <!-- no extra parameter when no custom background is requested -->
+  </conditional>
     <param name="geneuniverse" type="select" label="Select a specie">
       <!--option value="org.At.tair.db" >Arabidopsis</option-->
       <option value="org.Ce.eg.db" >Worm (C. elegans)</option>
--- a/topGO_enrichment.R	Mon Oct 15 11:30:04 2018 -0400
+++ b/topGO_enrichment.R	Tue Oct 16 10:28:50 2018 -0400
@@ -47,7 +47,7 @@
 }

 read_file <- function(path,header){
-  file <- try(read.table(path,header=header, sep="\t",stringsAsFactors = FALSE, quote=""),silent=TRUE)
+  file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="\""),silent=TRUE)
   if (inherits(file,"try-error")){
     stop("File not found !")
   }else{
@@ -55,6 +55,14 @@
   }
 }

+get_list_from_cp <-function(list){   # split one pasted string of ids (separated by ";", "," or whitespace) into a character vector
+  ids = gsub("[;,]"," ",list)         # ";" and "," are handled like whitespace separators
+  ids = strsplit(ids, "[ \t\n]+")[[1]]
+  ids = ids[ids != ""]    #remove empty entry
+  ids = gsub("-[0-9]+$", "", ids)  #Remove isoform accession number (e.g. "-2") only; hyphenated ids accepted by check_ens_ids (e.g. "YBR111W-A") are kept
+  return(ids)
+}
+
 check_ens_ids <- function(vector) {
   ens_pattern = "^(ENS[A-Z]+[0-9]{11}|[A-Z]{3}[0-9]{3}[A-Za-z](-[A-Za-z])?|CG[0-9]+|[A-Z0-9]+\\.[0-9]+|YM[A-Z][0-9]{3}[a-z][0-9])$"
   return(grepl(ens_pattern,vector))
@@ -238,23 +246,26 @@

 # Launch enrichment analysis and return result data from the analysis or the null
 # object if the enrichment could not be done.
-goEnrichment = function(geneuniverse,sample,onto){
+goEnrichment = function(geneuniverse,sample,background_sample,onto){

-  # get all the GO terms of the corresponding ontology (BP/CC/MF) and all their
-  # associated ensembl ids according to the org package
-  xx = annFUN.org(onto,mapping=geneuniverse,ID="ensembl")
-  allGenes = unique(unlist(xx))
-  # check if the genes given by the user can be found in the org package (gene
-  # universe), that is in
-  # allGenes
+  if (is.null(background_sample)){
+    xx = annFUN.org(onto,mapping=geneuniverse,ID="ensembl")   # get all the GO terms of the corresponding ontology (BP/CC/MF) and all their associated ensembl ids according to the org package
+    allGenes = unique(unlist(xx))                             # default gene universe: every id returned by the org package mapping
+  } else {
+    allGenes = unique(background_sample)                      # user-defined universe; duplicates dropped so that gene names stay unique
+  }
+
   if (length(intersect(sample,allGenes))==0){
-
     print("None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably have no associated GO terms.")
     return(c(NULL,NULL))
-
   }

   geneList = factor(as.integer(allGenes %in% sample))
+  if (all(geneList == "1")){   # scalar tests: levels(geneList) normally has two entries and cannot be used directly as an if() condition
+    stop("All background genes are found in tested genes dataset, enrichment analysis can't be done")
+  } else if (all(geneList == "0")){
+    stop("None of the background genes are found in tested genes dataset, enrichment analysis can't be done")
+  }
   names(geneList) <- allGenes

   #topGO enrichment
@@ -292,13 +303,20 @@
 column = as.numeric(gsub("c","",args$column))
 geneuniverse = args$geneuniverse
 header = str2bool(args$header)
+background <- str2bool(args$background)  # whether a custom background was requested
+if (background) {  # the other background options are only read when it was
+  background_input_type <- args$background_input_type
+  background_genes <- args$background_genes
+  background_column <- as.numeric(gsub("c", "", args$background_column))  # "c1" -> 1
+  background_header <- str2bool(args$background_header)
+}

 #get input
 if (input_type=="copy_paste"){
-  sample <- unlist(strsplit(input,","))
+  sample <- get_list_from_cp(input)
 } else if (input_type=="file"){
   tab=read_file(input,header)
-  sample = tab[,column]
+  sample = trimws(unlist(strsplit(as.character(tab[,column]),";")))  # a cell may hold several ";"-separated ids; as.character because strsplit() rejects non-character columns
 }

 #check of ENS ids
@@ -307,8 +325,26 @@
   stop()
 }

+#get the background ids; background_sample stays NULL without a custom background (goEnrichment then uses the org package universe)
+background_sample=NULL
+if (background){
+  if (background_input_type=="copy_paste"){
+    background_sample <- unique(get_list_from_cp(background_genes))
+  } else if (background_input_type=="file"){
+    background_tab=read_file(background_genes,background_header)
+    #a cell may hold several ";"-separated ids; as.character because strsplit() rejects non-character columns
+    background_sample = unique(trimws(unlist(strsplit(as.character(background_tab[,background_column]),";"))))
+  }
+}
+
+#check of ENS ids, only when a background was requested: any() of an empty result is FALSE and would always stop the script
+if (background && ! any(check_ens_ids(background_sample))){
+  print("no ensembl gene ids found in your background ids list, please check your IDs in input or the selected column of your input file")
+  stop()
+}
+
 # Launch enrichment analysis
-allresult = suppressMessages(goEnrichment(geneuniverse,sample,onto))
+allresult = suppressMessages(goEnrichment(geneuniverse,sample,background_sample,onto))
 result = allresult[1][[1]]
 myGOdata = allresult[2][[1]]
 if (!is.null(result)){