Mercurial > repos > proteore > proteore_expression_rnaseq_abbased
diff add_expression_HPA.R @ 2:70e3b70ded22 draft
planemo upload commit f4517998e11df15ce84f98d2b209e5c43b572c8d-dirty
| author | proteore |
|---|---|
| date | Tue, 04 Dec 2018 05:47:23 -0500 |
| parents | 03487ed0f458 |
| children | d7d2a4512059 |
line wrap: on
line diff
--- a/add_expression_HPA.R Wed Nov 14 05:34:14 2018 -0500 +++ b/add_expression_HPA.R Tue Dec 04 05:47:23 2018 -0500 @@ -20,15 +20,68 @@ } add_expression = function(input, atlas, options) { + input <- unique(input[!is.na(input)]) + input <- gsub("[[:blank:]]","",input) if (all(!input %in% atlas$Ensembl)) { return(NULL) } else { res = atlas[match(input,atlas$Ensembl),c("Ensembl",options)] + res = res[which(!is.na(res[,1])),] + row.names(res)=res[,1] + res=res[2:ncol(res)] res <- as.data.frame(apply(res, c(1,2), function(x) gsub("^$|^ $", NA, x))) #convert "" et " " to NA return(res) } } +order_columns <- function (df,ncol){ + if (ncol==1){ #already at the right position + return (df) + } else { + df = df[,c(2:ncol,1,(ncol+1):dim.data.frame(df)[2])] + } + return (df) +} + +#take data frame, return data frame +split_ids_per_line <- function(line,ncol){ + + #print (line) + header = colnames(line) + line[ncol] = gsub("[[:blank:]]","",line[ncol]) + + if (length(unlist(strsplit(as.character(line[ncol]),";")))>1) { + if (length(line)==1 ) { + lines = as.data.frame(unlist(strsplit(as.character(line[ncol]),";")),stringsAsFactors = F) + } else { + if (ncol==1) { #first column + lines = suppressWarnings(cbind(unlist(strsplit(as.character(line[ncol]),";")), line[2:length(line)])) + } else if (ncol==length(line)) { #last column + lines = suppressWarnings(cbind(line[1:ncol-1],unlist(strsplit(as.character(line[ncol]),";")))) + } else { + lines = suppressWarnings(cbind(line[1:ncol-1], unlist(strsplit(as.character(line[ncol]),";"),use.names = F), line[(ncol+1):length(line)])) + } + } + colnames(lines)=header + return(lines) + } else { + return(line) + } +} + +#create new lines if there's more than one id per cell in the columns in order to have only one id per line +one_id_one_line <-function(tab,ncol){ + + tab[,ncol] = sapply(tab[,ncol],function(x) gsub("[[:blank:]]","",x)) + header=colnames(tab) + res=as.data.frame(matrix(ncol=ncol(tab),nrow=0)) + for (i in 1:nrow(tab) ) { + lines = split_ids_per_line(tab[i,],ncol) + res = rbind(res,lines) + } + return(res) +} + main = function() { args <- commandArgs(TRUE) if(length(args)<1) { @@ -74,7 +127,9 @@ } header = str2bool(args$header) file = read_file(filename, header) - input = unlist(sapply(as.character(file[,ncol]),function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE)) + file = one_id_one_line(file,ncol) + input = unlist(sapply(as.character(file[,ncol]),function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE)) + input = input[which(!is.na(input))] } # Read protein atlas @@ -86,17 +141,17 @@ options = strsplit(args$select, ",")[[1]] res = add_expression(input, protein_atlas, options) - # Write output if (is.null(res)) { write.table("None of the input ENSG ids are can be found in HPA data file",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE) } else { if (inputtype == "copypaste") { - output_content = cbind(as.matrix(input), res) + input <- data.frame(input) + output_content = merge(input,res,by.x=1,by.y="row.names",incomparables = NA, all.x=T) colnames(output_content)[1] = "Ensembl" } else if (inputtype == "tabfile") { - output_content = merge(file, res, by.x=ncol, by.y=1, incomparables = NA,all.x=T) - output_content = output_content[order(output_content[,ncol],decreasing = T),] + output_content = merge(file, res, by.x=ncol, by.y="row.names", incomparables = NA, all.x=T) + output_content = order_columns(output_content,ncol) } output_content <- as.data.frame(apply(output_content, c(1,2), function(x) gsub("^$|^ $", NA, x))) write.table(output_content, output, row.names = FALSE, sep = "\t", quote = FALSE)
