comparison GO-enrich.R @ 42:3a990910e7f4 draft default tip

"planemo upload commit be9070db4a3b178ab45ecd9c9c3ab369240f7617-dirty"
author proteore
date Fri, 09 Apr 2021 14:38:05 +0000
parents 3fd1ccc57a6a
children
comparison
equal deleted inserted replaced
41:3fd1ccc57a6a 42:3a990910e7f4
1 options(warn=-1) #TURN OFF WARNINGS !!!!!! 1 options(warn = -1) #TURN OFF WARNINGS !!!!!!
2 suppressMessages(library(clusterProfiler,quietly = TRUE)) 2 suppressMessages(library(clusterProfiler, quietly = TRUE))
3 3
4 # Read file and return file content as data.frame 4 # Read file and return file content as data.frame
5 read_file <- function(path,header){ 5 read_file <- function(path, header) {
6 file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="",check.names = F),silent=TRUE) 6 file <- try(read.csv(path, header = header, sep = "\t",
7 if (inherits(file,"try-error")){ 7 stringsAsFactors = FALSE, quote = "", check.names = F),
8 silent = TRUE)
9 if (inherits(file, "try-error")) {
8 stop("File not found !") 10 stop("File not found !")
9 }else{ 11 }else{
10 file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE] 12 file <- file[!apply(is.na(file) | file == "", 1, all), , drop = FALSE]
11 return(file) 13 return(file)
12 } 14 }
13 } 15 }
14 16
15 #return the number of character from the longest description found (from the 10 first) 17
16 max_str_length_10_first <- function(vector){ 18 #return the number of character from the longest description found
19 #(from the 10 first)
20 max_str_length_10_first <- function(vector) {
17 vector <- as.vector(vector) 21 vector <- as.vector(vector)
18 nb_description = length(vector) 22 nb_description <- length(vector)
19 if (nb_description >= 10){nb_description=10} 23 if (nb_description >= 10) {
24 nb_description <- 10
25 }
26
20 return(max(nchar(vector[1:nb_description]))) 27 return(max(nchar(vector[1:nb_description])))
21 } 28 }
22 29
23 str2bool <- function(x){ 30 str2bool <- function(x) {
24 if (any(is.element(c("t","true"),tolower(x)))){ 31 if (any(is.element(c("t", "true"), tolower(x)))) {
25 return (TRUE) 32 return(TRUE)
26 }else if (any(is.element(c("f","false"),tolower(x)))){ 33 }else if (any(is.element(c("f", "false"), tolower(x)))) {
27 return (FALSE) 34 return(FALSE)
35
28 }else{ 36 }else{
29 return(NULL) 37 return(NULL)
30 } 38 }
31 } 39 }
32 40
33 #used before the limit was set to 50 characters 41 #used before the limit was set to 50 characters
34 width_by_max_char <- function (nb_max_char) { 42 width_by_max_char <- function(nb_max_char) {
35 if (nb_max_char < 50 ){ 43 if (nb_max_char < 50) {
36 width=600 44 width <- 600
37 } else if (nb_max_char < 75) { 45 } else if (nb_max_char < 75) {
38 width=800 46 width <- 800
39 } else if (nb_max_char < 100) { 47 } else if (nb_max_char < 100) {
40 width=900 48 width <- 900
41 } else { 49 } else {
42 width=1000 50 width <- 1000
43 } 51 }
44 return (width) 52 return(width)
45 } 53 }
46 54
47 repartition_GO <- function(geneid, orgdb, ontology, level=3, readable=TRUE) { 55
48 ggo<-groupGO(gene=geneid, 56 repartition_go <- function(geneid, orgdb, ontology,
49 OrgDb = orgdb, 57 level = 3, readable = TRUE) {
50 ont=ontology, 58 ggo <- groupGO(gene = geneid,
51 level=level, 59 OrgDb = orgdb,
52 readable=TRUE) 60 ont = ontology,
53 61 level = level,
54 ggo@result<-ggo@result[order(-ggo@result$Count),] 62 readable = TRUE)
55 63
56 64
57 if (length(ggo@result$ID) > 0 ) { 65 if (length(ggo@result$ID) > 0) {
58 ggo@result$Description <- sapply(as.vector(ggo@result$Description), function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE) 66 ggo@result$Description <- sapply(as.vector(ggo@result$Description), #nolint
59 #nb_max_char = max_str_length_10_first(ggo$Description) 67 function(x) {
60 #width = width_by_max_char(nb_max_char) 68 ifelse(nchar(x) > 50,
69 substr(x, 1, 50), x)}, USE.NAMES = FALSE)
70
71
61 name <- paste("GGO_", ontology, "_bar-plot", sep = "") 72 name <- paste("GGO_", ontology, "_bar-plot", sep = "")
62 png(name,height = 720, width = 600) 73 png(name, height = 720, width = 600)
63 p <- barplot(ggo, showCategory=20) 74 p <- barplot(ggo, showCategory = 10)
64 print(p) 75 print(p)
65 dev.off() 76 dev.off()
66 ggo <- as.data.frame(ggo) 77 ggo <- as.data.frame(ggo)
67 return(ggo) 78 return(ggo)
68 } 79 }
69 } 80 }
70 81
82 # nolint end
83
71 # GO over-representation test 84 # GO over-representation test
72 enrich_GO <- function(geneid, universe, orgdb, ontology, pval_cutoff, qval_cutoff,plot) { 85 enrich_go <- function(geneid, universe, orgdb, ontology, pval_cutoff,
73 ego<-enrichGO(gene=geneid, 86 qval_cutoff, plot) {
74 universe=universe, 87 ego <- enrichGO(gene = geneid,
75 OrgDb=orgdb, 88
76 ont=ontology, 89 universe = universe,
77 pAdjustMethod="BH", 90 OrgDb = orgdb,
78 pvalueCutoff=pval_cutoff, 91 ont = ontology,
79 qvalueCutoff=qval_cutoff, 92 pAdjustMethod = "BH",
80 readable=TRUE) 93 pvalueCutoff = pval_cutoff,
81 94 qvalueCutoff = qval_cutoff,
95 readable = TRUE)
96
82 # Plot bar & dot plots 97 # Plot bar & dot plots
83 #if there are enriched GopTerms 98 #if there are enriched GoTerms
84 if (length(ego$ID)>0){ 99
85 100
86 ego@result$Description <- sapply(ego@result$Description, function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE) 101
87 #nb_max_char = max_str_length_10_first(ego$Description) 102 if (length(ego$ID) > 0) {
88 #width = width_by_max_char(nb_max_char) 103
89 104 ego@result$Description <- sapply(ego@result$Description, function(x) { #nolint
90 if ("dotplot" %in% plot ){ 105 ifelse(nchar(x) > 50, substr(x, 1, 50), x)
106 }, USE.NAMES = FALSE)
107
108
109 if ("dotplot" %in% plot) {
91 dot_name <- paste("EGO_", ontology, "_dot-plot", sep = "") 110 dot_name <- paste("EGO_", ontology, "_dot-plot", sep = "")
92 png(dot_name,height = 720, width = 600) 111 png(dot_name, height = 720, width = 600)
93 p <- dotplot(ego, showCategory=20) 112 p <- dotplot(ego, showCategory = 10) #nolint
94 print(p) 113 print(p)
95 dev.off() 114 dev.off()
96 } 115 }
97 116
98 if ("barplot" %in% plot ){ 117 if ("barplot" %in% plot) {
99 bar_name <- paste("EGO_", ontology, "_bar-plot", sep = "") 118 bar_name <- paste("EGO_", ontology, "_bar-plot", sep = "")
100 png(bar_name,height = 720, width = 600) 119 png(bar_name, height = 720, width = 600)
101 p <- barplot(ego, showCategory=20) 120 p <- barplot(ego, showCategory = 10)
102 print(p) 121 print(p)
103 dev.off() 122 dev.off()
104 123 }
105 } 124
106 ego <- as.data.frame(ego) 125 ego <- as.data.frame(ego)
107 return(ego) 126 return(ego)
108 } else { 127 } else {
109 warning(paste("No Go terms enriched (EGO) found for ",ontology,"ontology"),immediate. = TRUE,noBreaks. = TRUE,call. = FALSE) 128 warning(paste("No Go terms enriched (EGO) found for ",
110 } 129 ontology, "ontology"), immediate. = TRUE, noBreaks. = TRUE, call. = FALSE)
111 } 130 }
112 131 }
113 clean_ids <- function(ids){ 132
114 ids = gsub(" ","",ids) 133
115 ids = ids[which(ids!="")] 134 clean_ids <- function(ids) {
116 ids = ids[which(ids!="NA")] 135 ids <- gsub(" ", "", ids)
117 ids = ids[!is.na(ids)] 136 ids <- ids[which(ids != "")]
118 137 ids <- ids[which(ids != "NA")]
119 return(ids) 138 ids <- ids[!is.na(ids)]
120 } 139
121 140 return(ids)
122 check_ids <- function(vector,type) { 141 }
123 uniprot_pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" 142
124 entrez_id = "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$" 143 check_ids <- function(vector, type) {
125 if (type == "entrez") 144 uniprot_pattern <- "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" #nolint
126 return(grepl(entrez_id,vector)) 145 entrez_id <- "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$" #nolint
127 else if (type == "uniprot") { 146 if (type == "entrez") {
128 return(grepl(uniprot_pattern,vector)) 147 return(grepl(entrez_id, vector))
129 } 148 }else if (type == "uniprot") {
130 } 149 return(grepl(uniprot_pattern, vector))
131 150 }
132 get_args <- function(){ 151 }
152
153 get_args <- function() {
133 args <- commandArgs(TRUE) 154 args <- commandArgs(TRUE)
134 if(length(args)<1) { 155 if (length(args) < 1) {
135 args <- c("--help") 156 args <- c("--help")
136 } 157 }
137 158
138 # Help section 159 # Help section
139 if("--help" %in% args) { 160 if ("--help" %in% args) {
140 cat("clusterProfiler Enrichment Analysis 161 cat("clusterProfiler Enrichment Analysis
141 Arguments: 162 Arguments:
142 --input_type: type of input (list of id or filename) 163 --input_type: type of input (list of id or filename)
143 --input: input 164 --input: input
144 --ncol: the column number which contains list of input IDs 165 --ncol: the column number which contains list of input IDs
153 --onto_opt: ontology options 174 --onto_opt: ontology options
154 --go_function: groupGO/enrichGO 175 --go_function: groupGO/enrichGO
155 --level: 1-3 176 --level: 1-3
156 --pval_cutoff 177 --pval_cutoff
157 --qval_cutoff 178 --qval_cutoff
158 --text_output: text output filename 179 --text_output: text output filename
159 --plot : type of visualization, dotplot or/and barplot \n") 180 --plot : type of visualization, dotplot or/and barplot \n")
160 q(save="no") 181 q(save = "no")
161 } 182 }
162 # Parse arguments 183 # Parse arguments
163 parseArgs <- function(x) strsplit(sub("^--", "", x), "=") 184
164 argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) 185 parseargs <- function(x) strsplit(sub("^--", "", x), "=")
165 args <- as.list(as.character(argsDF$V2)) 186 argsdf <- as.data.frame(do.call("rbind", parseargs(args)))
166 names(args) <- argsDF$V1 187 args <- as.list(as.character(argsdf$V2))
167 188 names(args) <- argsdf$V1
168 return(args) 189 return(args)
169 } 190 }
170 191
171 192 main <- function() { #nolint
172 main <- function() {
173
174 #get args from command 193 #get args from command
175 args <- get_args() 194 args <- get_args()
176 195
177 #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/cluster_profiler/args.Rda") 196 go_represent <- str2bool(args$go_represent)
178 #load("/home/dchristiany/proteore_project/ProteoRE/tools/cluster_profiler/args.Rda") 197 go_enrich <- str2bool(args$go_enrich)
179 198 if (go_enrich) {
180 go_represent=str2bool(args$go_represent) 199 plot <- unlist(strsplit(args$plot, ","))
181 go_enrich=str2bool(args$go_enrich) 200 }
182 if (go_enrich){ 201
183 plot = unlist(strsplit(args$plot,","))
184 }
185
186 suppressMessages(library(args$species, character.only = TRUE, quietly = TRUE)) 202 suppressMessages(library(args$species, character.only = TRUE, quietly = TRUE))
187 203
188 # Extract OrgDb 204 # Extract OrgDb
189 if (args$species=="org.Hs.eg.db") { 205 if (args$species == "org.Hs.eg.db") {
190 orgdb<-org.Hs.eg.db 206 orgdb <- org.Hs.eg.db #nolint
191 } else if (args$species=="org.Mm.eg.db") { 207 } else if (args$species == "org.Mm.eg.db") {
192 orgdb<-org.Mm.eg.db 208 orgdb <- org.Mm.eg.db #nolint
193 } else if (args$species=="org.Rn.eg.db") { 209 } else if (args$species == "org.Rn.eg.db") {
194 orgdb<-org.Rn.eg.db 210 orgdb <- org.Rn.eg.db #nolint
195 } 211 }
196 212
197 # Extract input IDs 213 # Extract input IDs
198 input_type = args$input_type 214 input_type <- args$input_type
199 id_type = args$id_type 215 id_type <- args$id_type
200 216
201 if (input_type == "text") { 217 if (input_type == "text") {
202 input = unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]],";")) 218 input <- unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]], ";"))
203 } else if (input_type == "file") { 219 } else if (input_type == "file") {
204 filename = args$input 220 filename <- args$input
205 ncol = args$ncol 221 ncol <- args$ncol
206 # Check ncol 222 # Check ncol
207 if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) { 223 if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) {
208 stop("Please enter the right format for column number: c[number]") 224 stop("Please enter the right format for column number: c[number]")
209 } else { 225 } else {
210 ncol = as.numeric(gsub("c", "", ncol)) 226 ncol <- as.numeric(gsub("c", "", ncol))
211 } 227 }
212 header = str2bool(args$header) # Get file content 228
213 file = read_file(filename, header) # Extract Protein IDs list 229 header <- str2bool(args$header) # Get file content
214 input = unlist(sapply(as.character(file[,ncol]),function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE)) 230 file <- read_file(filename, header) # Extract Protein IDs list
215 } 231 input <- unlist(sapply(as.character(file[, ncol]),
216 input = clean_ids(input) 232 function(x) rapply(strsplit(x, ";"), c), USE.NAMES = FALSE))
217 233 }
234 input <- clean_ids(input)
235
218 ## Get input gene list from input IDs 236 ## Get input gene list from input IDs
219 #ID format Conversion 237 #ID format Conversion
220 #This case : from UNIPROT (protein id) to ENTREZ (gene id) 238 #This case : from UNIPROT (protein id) to ENTREZ (gene id)
221 #bitr = conversion function from clusterProfiler 239 #bitr = conversion function from clusterProfiler
222 if (id_type=="Uniprot" & any(check_ids(input,"uniprot"))) { 240 if (id_type == "Uniprot" & any(check_ids(input, "uniprot"))) {
223 any(check_ids(input,"uniprot")) 241 any(check_ids(input, "uniprot"))
224 idFrom<-"UNIPROT" 242
225 idTo<-"ENTREZID" 243 idfrom <- "UNIPROT"
226 suppressMessages(gene<-bitr(input, fromType=idFrom, toType=idTo, OrgDb=orgdb)) 244 idto <- "ENTREZID"
227 gene<-unique(gene$ENTREZID) 245 suppressMessages(gene <- bitr(input, fromType = idfrom, toType = idto,
228 } else if (id_type=="Entrez" & any(check_ids(input,"entrez"))) { 246 OrgDb = orgdb))
229 gene<-unique(input) 247
248 gene <- unique(gene$ENTREZID)
249 } else if (id_type == "Entrez" & any(check_ids(input, "entrez"))) {
250 gene <- unique(input)
230 } else { 251 } else {
231 stop(paste(id_type,"not found in your ids list, please check your IDs in input or the selected column of your input file")) 252
253 stop(paste(id_type, "not found in your ids list,
254 please check your IDs in input or the selected column of your input file"))
255
232 } 256 }
233 257
234 ontology <- strsplit(args$onto_opt, ",")[[1]] 258 ontology <- strsplit(args$onto_opt, ",")[[1]]
235 259
236 ## Extract GGO/EGO arguments 260 ## Extract GGO/EGO arguments
237 if (go_represent) {level <- as.numeric(args$level)} 261 if (go_represent) {
262 level <- as.numeric(args$level)
263
264 }
265
238 if (go_enrich) { 266 if (go_enrich) {
239 pval_cutoff <- as.numeric(args$pval_cutoff) 267 pval_cutoff <- as.numeric(args$pval_cutoff)
240 qval_cutoff <- as.numeric(args$qval_cutoff) 268 qval_cutoff <- as.numeric(args$qval_cutoff)
241 # Extract universe background genes (same as input file) 269 # Extract universe background genes (same as input file)
242 if (!is.null(args$universe_type)) { 270 if (!is.null(args$universe_type)) {
243 universe_type = args$universe_type 271 universe_type <- args$universe_type
244 if (universe_type == "text") { 272 if (universe_type == "text") {
245 universe = unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]],";")) 273 universe <- unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]], ";"))
246 } else if (universe_type == "file") { 274 } else if (universe_type == "file") {
247 universe_filename = args$universe 275 universe_filename <- args$universe
248 universe_ncol = args$uncol 276 universe_ncol <- args$uncol
249 # Check ncol 277 # Check ncol
250 if (! as.numeric(gsub("c", "", universe_ncol)) %% 1 == 0) { 278 if (! as.numeric(gsub("c", "", universe_ncol)) %% 1 == 0) {
251 stop("Please enter the right format for column number: c[number]") 279 stop("Please enter the right format for column number: c[number]")
252 } else { 280 } else {
253 universe_ncol = as.numeric(gsub("c", "", universe_ncol)) 281 universe_ncol <- as.numeric(gsub("c", "", universe_ncol))
254 } 282 }
255 universe_header = str2bool(args$uheader) 283 universe_header <- str2bool(args$uheader)
256 # Get file content 284 # Get file content
257 universe_file = read_file(universe_filename, universe_header) 285 universe_file <- read_file(universe_filename, universe_header)
258 # Extract Protein IDs list 286 # Extract Protein IDs list
259 universe <- unlist(sapply(universe_file[,universe_ncol], function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE)) 287 universe <- unlist(sapply(universe_file[, universe_ncol],
288 function(x) rapply(strsplit(x, ";"), c), USE.NAMES = FALSE))
260 } 289 }
261 universe = clean_ids(input) 290 universe <- clean_ids(input)
262 universe_id_type = args$universe_id_type 291 universe_id_type <- args$universe_id_type
263 ##to initialize 292 ##to initialize
264 if (universe_id_type=="Uniprot" & any(check_ids(universe,"uniprot"))) { 293 if (universe_id_type == "Uniprot" & any(check_ids(universe, "uniprot"))) {
265 idFrom<-"UNIPROT" 294 idfrom <- "UNIPROT"
266 idTo<-"ENTREZID" 295 idto <- "ENTREZID"
267 suppressMessages(universe_gene<-bitr(universe, fromType=idFrom, toType=idTo, OrgDb=orgdb)) 296 suppressMessages(universe_gene <- bitr(universe, fromType = idfrom,
268 universe_gene<-unique(universe_gene$ENTREZID) 297 toType = idto, OrgDb = orgdb))
269 } else if (universe_id_type=="Entrez" & any(check_ids(universe,"entrez"))) { 298 universe_gene <- unique(universe_gene$ENTREZID)
270 universe_gene<-unique(unlist(universe)) 299 } else if (universe_id_type == "Entrez" &
300 any(check_ids(universe, "entrez"))) {
301 universe_gene <- unique(unlist(universe))
271 } else { 302 } else {
272 if (universe_type=="text"){ 303 if (universe_type == "text") {
273 print(paste(universe_id_type,"not found in your background IDs list",sep=" ")) 304 print(paste(universe_id_type, "not found in your background IDs list",
305 sep = " "))
274 } else { 306 } else {
275 print(paste(universe_id_type,"not found in the column",universe_ncol,"of your background IDs file",sep=" ")) 307 print(paste(universe_id_type, "not found in the column",
308 universe_ncol, "of your background IDs file", sep = " "))
309
276 } 310 }
277 universe_gene = NULL 311 universe_gene <- NULL
278 } 312 }
279 } else { 313 } else {
280 universe_gene = NULL 314 universe_gene <- NULL
281 } 315 }
282 } else { 316 } else {
283 universe_gene = NULL 317 universe_gene <- NULL
284 } 318 }
285 319
286 ##enrichGO : GO over-representation test 320 ##enrichGO : GO over-representation test
287 for (onto in ontology) { 321 for (onto in ontology) {
288 if (go_represent) { 322 if (go_represent) {
289 ggo<-repartition_GO(gene, orgdb, onto, level, readable=TRUE) 323 ggo <- repartition_go(gene, orgdb, onto, level, readable = TRUE)
290 if (is.list(ggo)){ggo <- as.data.frame(apply(ggo, c(1,2), function(x) gsub("^$|^ $", NA, x)))} #convert "" and " " to NA 324 if (is.list(ggo)) {
291 output_path = paste("cluster_profiler_GGO_",onto,".tsv",sep="") 325 ggo <- as.data.frame(apply(ggo, c(1, 2),
292 write.table(ggo, output_path, sep="\t", row.names = FALSE, quote = FALSE ) 326 function(x) gsub("^$|^ $", NA, x)))
327 } #convert "" and " " to NA
328 output_path <- paste("cluster_profiler_GGO_", onto, ".tsv", sep = "")
329 write.table(ggo, output_path, sep = "\t",
330 row.names = FALSE, quote = FALSE)
331
293 } 332 }
294 333
295 if (go_enrich) { 334 if (go_enrich) {
296 ego<-enrich_GO(gene, universe_gene, orgdb, onto, pval_cutoff, qval_cutoff,plot) 335 ego <- enrich_go(gene, universe_gene, orgdb, onto, pval_cutoff,
297 if (is.list(ego)){ego <- as.data.frame(apply(ego, c(1,2), function(x) gsub("^$|^ $", NA, x)))} #convert "" and " " to NA 336 qval_cutoff, plot)
298 output_path = paste("cluster_profiler_EGO_",onto,".tsv",sep="") 337 if (is.list(ego)) {
299 write.table(ego, output_path, sep="\t", row.names = FALSE, quote = FALSE ) 338 ego <- as.data.frame(apply(ego, c(1, 2),
300 } 339 function(x) gsub("^$|^ $", NA, x)))
301 } 340 } #convert "" and " " to NA
302 } 341 output_path <- paste("cluster_profiler_EGO_", onto, ".tsv", sep = "")
303 342 write.table(ego, output_path, sep = "\t",
304 if(!interactive()) { 343 row.names = FALSE, quote = FALSE)
344 }
345 }
346 }
347
348 if (!interactive()) {
305 main() 349 main()
306 } 350 }