# HG changeset patch # User proteore # Date 1561622186 14400 # Node ID ea59f5750c512a35c866967323e98a068e0cb20f # Parent e919b55188ab4d1312b2a0e429fb99dfe9bcb667 planemo upload commit fb27a6b5de5cd7b269a41be3c85c593b77aa1b18-dirty diff -r e919b55188ab -r ea59f5750c51 add_expression_HPA.R --- a/add_expression_HPA.R Wed Jan 02 04:26:18 2019 -0500 +++ b/add_expression_HPA.R Thu Jun 27 03:56:26 2019 -0400 @@ -19,6 +19,24 @@ } } +stopQuietly <- function(...) { + blankMsg <- sprintf("\r%s\r", paste(rep(" ", getOption("width")-1L), collapse=" ")); + stop(simpleError(blankMsg)); +} # stopQuietly() + +check_ensembl_geneids <- function(vector,type) { + ensembl_geneid_pattern = "^ENS[A-Z]+[0-9]{11}$|^[A-Z]{3}[0-9]{3}[A-Za-z](-[A-Za-z])?$|^CG[0-9]+$|^[A-Z0-9]+[.][0-9]+$|^YM[A-Z][0-9]{3}[a-z][0-9]$" + res = grepl(ensembl_geneid_pattern,vector) + if (all(!res)){ + cat("No Ensembl geneIDs found in entered ids") + stopQuietly() + } else if (any(!res)) { + cat(paste(sep="",collapse = " ",c(sum(!res, na.rm=TRUE),'IDs are not ENSG IDs, please check:\n'))) + not_geneids <- sapply(vector[which(!res)], function(x) paste(sep="",collapse = "",x,"\n"),USE.NAMES = F) + cat(not_geneids) + } +} + add_expression = function(input, atlas, options) { input <- unique(input[!is.na(input)]) input <- gsub("[[:blank:]]|\u00A0","",input) @@ -89,7 +107,7 @@ return(res) } -main = function() { +get_args <- function(){ args <- commandArgs(TRUE) if(length(args)<1) { args <- c("--help") @@ -116,13 +134,28 @@ argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) args <- as.list(as.character(argsDF$V2)) names(args) <- argsDF$V1 + + return(args) +} + +is_col_in_file <- function(file,ncol) { + is_in_file = (ncol <= ncol(file) && ncol > 0) + if (!is_in_file){ + cat(paste(sep = "", collapse = " ", c("Column",ncol,"not found in file") )) + stopQuietly() + } +} + +main = function() { + + args = get_args() #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/add_expression_data_HPA/args.rda") 
#load("/home/dchristiany/proteore_project/ProteoRE/tools/add_expression_data_HPA/args.rda") inputtype = args$inputtype if (inputtype == "copypaste") { - input = strsplit(args$input, "[ \t\n]+")[[1]] + ids = strsplit(args$input, "[ \t\n]+")[[1]] } else if (inputtype == "tabfile") { filename = args$input ncol = args$column @@ -134,10 +167,12 @@ } header = str2bool(args$header) file = read_file(filename, header) + is_col_in_file(file,ncol) file = one_id_one_line(file,ncol) - input = unlist(sapply(as.character(file[,ncol]),function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE)) - input = input[which(!is.na(input))] + ids = unlist(sapply(as.character(file[,ncol]),function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE)) + ids = ids[which(!is.na(ids))] } + check_ensembl_geneids(ids) # Read protein atlas protein_atlas = args$atlas @@ -146,15 +181,15 @@ # Add expression output = args$output options = strsplit(args$select, ",")[[1]] - res = add_expression(input, protein_atlas, options) + res = add_expression(ids, protein_atlas, options) # Write output if (is.null(res)) { - write.table("None of the input ENSG ids are can be found in HPA data file",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE) + write.table("None of the ENSG ids entered can be found in HPA data file",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE) } else { if (inputtype == "copypaste") { - input <- data.frame(input) - output_content = merge(input,res,by.x=1,by.y="row.names",incomparables = NA, all.x=T) + ids <- data.frame(ids) + output_content = merge(ids,res,by.x=1,by.y="row.names",incomparables = NA, all.x=T) colnames(output_content)[1] = "Ensembl" } else if (inputtype == "tabfile") { output_content = merge(file, res, by.x=ncol, by.y="row.names", incomparables = NA, all.x=T) diff -r e919b55188ab -r ea59f5750c51 add_expression_data.xml --- a/add_expression_data.xml Wed Jan 02 04:26:18 2019 -0500 +++ b/add_expression_data.xml Thu Jun 27 03:56:26 2019 -0400 @@ -1,4 +1,4 @@ 
- + (RNAseq or Immuno-assays)[Human Protein Atlas] @@ -24,7 +24,7 @@ - + @@ -42,11 +42,10 @@ - - - - + + [c]{0,1}[0-9]+ +
@@ -95,7 +94,11 @@ **Input** -Input can be either a list of Ensembl gene (ENSG) IDsds (copy/paste mode) or a file containing multiple fields with at least one column of Ensembl gene IDs. If your input file contains other type of IDs, please use the ID_Converter tool to create a column of Ensembl gene IDs. +Input can be either a list of Ensembl gene (ENSG) IDs (copy/paste mode) or a file containing multiple fields with at least one column of Ensembl gene IDs. If your input file contains other types of IDs, please use the ID_Converter tool to create a column of Ensembl gene IDs. + +.. class:: warningmark + +In copy/paste mode, the number of IDs considered in input is limited to 5000. ----- @@ -127,13 +130,13 @@ **Output** -The output is a tabular file containing original columns and new columns including selected annotation. +The output is a tabular file containing initial columns and new columns with annotation from HPA. ----- **Data sources (release date)** -HPA source file (Human Protein Atlas version 18): http://www.proteinatlas.org/download/proteinatlas.tab.gz +HPA source file (data are based on the Human Protein Atlas version 18.1 and Ensembl version 88.38): http://www.proteinatlas.org/download/proteinatlas.tab.gz ----- @@ -141,13 +144,13 @@ **Authors** -Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck - CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR +Lisa Perus, David Christiany, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck - CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux - INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform, FR This work has been partially funded through the French National Agency for Research (ANR) IFB project. -Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool. +Help: contact@proteore.org for any questions or concerns about this tool. ]]>