# HG changeset patch # User proteore # Date 1544691477 18000 # Node ID 759850de6ed2990e82bc2dc709ceb39918e95cc6 # Parent 7746af0f8209c1bff0574890e583eaec38775638 planemo upload commit c599cfc156c77626df2b674bdfbd437b9f664ab9 diff -r 7746af0f8209 -r 759850de6ed2 add_protein_features.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/add_protein_features.R Thu Dec 13 03:57:57 2018 -0500 @@ -0,0 +1,202 @@ +# Read file and return file content as data.frame +read_file <- function(path,header){ + file <- try(read.table(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="", check.names = F),silent=TRUE) + if (inherits(file,"try-error")){ + stop("File not found !") + }else{ + file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE] + return(file) + } +} + +order_columns <- function (df,ncol,id_type,file){ + if (id_type=="Uniprot_AC"){ncol=ncol(file)} + if (ncol==1){ #already at the right position + return (df) + } else { + df = df[,c(2:ncol,1,(ncol+1):dim.data.frame(df)[2])] + } + return (df) +} + +get_list_from_cp <-function(list){ + list = strsplit(list, "[ \t\n]+")[[1]] + list = gsub("NA","",list) + list = list[list != ""] #remove empty entry + list = gsub("-.+", "", list) #Remove isoform accession number (e.g. "-2") + return(list) +} + +get_args <- function(){ + + ## Collect arguments + args <- commandArgs(TRUE) + + ## Default setting when no arguments passed + if(length(args) < 1) { + args <- c("--help") + } + + ## Help section + if("--help" %in% args) { + cat("Selection and Annotation HPA + Arguments: + --inputtype: type of input (list of id or filename) + --input: input + --nextprot: path to nextprot information file + --column: the column number which you would like to apply... + --header: true/false if your file contains a header + --type: the type of input IDs (Uniprot_AC/EntrezID) + --pc_features: IsoPoint,SeqLength,MW + --localization: Chr,SubcellLocations + --diseases_info: Diseases + --output: text output filename \n") + + q(save="no") + } + + parseArgs <- function(x) strsplit(sub("^--", "", x), "=") + argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) + args <- as.list(as.character(argsDF$V2)) + names(args) <- argsDF$V1 + + return(args) +} + +str2bool <- function(x){ + if (any(is.element(c("t","true"),tolower(x)))){ + return (TRUE) + }else if (any(is.element(c("f","false"),tolower(x)))){ + return (FALSE) + }else{ + return(NULL) + } +} + +#take data frame, return data frame +split_ids_per_line <- function(line,ncol){ + + #print (line) + header = colnames(line) + line[ncol] = gsub("[[:blank:]]|\u00A0","",line[ncol]) + + if (length(unlist(strsplit(as.character(line[ncol]),";")))>1) { + if (length(line)==1 ) { + lines = as.data.frame(unlist(strsplit(as.character(line[ncol]),";")),stringsAsFactors = F) + } else { + if (ncol==1) { #first column + lines = suppressWarnings(cbind(unlist(strsplit(as.character(line[ncol]),";")), line[2:length(line)])) + } else if (ncol==length(line)) { #last column + lines = suppressWarnings(cbind(line[1:ncol-1],unlist(strsplit(as.character(line[ncol]),";")))) + } else { + lines = suppressWarnings(cbind(line[1:ncol-1], unlist(strsplit(as.character(line[ncol]),";"),use.names = F), line[(ncol+1):length(line)])) + } + } + colnames(lines)=header + return(lines) + } else { + return(line) + } +} + +#create new lines if there's more than one id per cell in the columns in order to have only one id per line +one_id_one_line <-function(tab,ncol){ + + if (ncol(tab)>1){ + + tab[,ncol] = sapply(tab[,ncol],function(x) gsub("[[:blank:]]","",x)) + header=colnames(tab) + res=as.data.frame(matrix(ncol=ncol(tab),nrow=0)) + for (i in 1:nrow(tab) ) { + lines = split_ids_per_line(tab[i,],ncol) + res = rbind(res,lines) + } + }else { + res = unlist(sapply(tab[,1],function(x) strsplit(x,";")),use.names = F) + res = data.frame(res[which(!is.na(res[res!=""]))],stringsAsFactors = F) + colnames(res)=colnames(tab) + } + return(res) +} + +# Get information from neXtProt +get_nextprot_info <- function(nextprot,input,pc_features,localization,diseases_info){ + if(diseases_info){ + cols = c("NextprotID",pc_features,localization,"Diseases") + } else { + cols = c("NextprotID",pc_features,localization) + } + + cols=cols[cols!="None"] + info = nextprot[match(input,nextprot$NextprotID),cols] + return(info) +} + +protein_features = function() { + + args <- get_args() + + #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/add_human_protein_features/args.rda") + #load("/home/dchristiany/proteore_project/ProteoRE/tools/add_human_protein_features/args.rda") + + #setting variables + inputtype = args$inputtype + if (inputtype == "copy_paste") { + input = get_list_from_cp(args$input) + file = data.frame(input,stringsAsFactors = F) + ncol=1 + } else if (inputtype == "file") { + filename = args$input + ncol = args$column + # Check ncol + if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) { + stop("Please enter an integer for level") + } else { + ncol = as.numeric(gsub("c", "", ncol)) + } + + header = str2bool(args$header) + file = read_file(filename, header) # Get file content + if (any(grep(";",file[,ncol]))) {file = one_id_one_line(file,ncol)} + if (args$type == "NextprotID" && ! "NextprotID" %in% colnames(file)) { colnames(file)[ncol] <- "NextprotID" + } else if (args$type == "NextprotID" && "NextprotID" %in% colnames(file) && match("NextprotID",colnames(file))!=ncol ) { + colnames(file)[match("NextprotID",colnames(file))] <- "old_NextprotID" + colnames(file)[ncol] = "NextprotID" + } + } + + # Read reference file + nextprot = read_file(args$nextprot,T) + + # Parse arguments + id_type = args$type + pc_features = strsplit(args$pc_features, ",")[[1]] + localization = strsplit(args$localization, ",")[[1]] + diseases_info = str2bool(args$diseases_info) + output = args$output + + # Change the sample ids if they are Uniprot_AC ids to be able to match them with + # Nextprot data + if (id_type=="Uniprot_AC"){ + NextprotID = gsub("^NX_$","",gsub("^","NX_",file[,ncol])) + file = cbind(file,NextprotID) + if (inputtype=="copy_paste") {colnames(file)[1]="Uniprot-AC"} + ncol=ncol(file) + } + NextprotID = file[,ncol] + + #Select user input protein ids in nextprot + #NextprotID = unique(NextprotID[which(!is.na(NextprotID[NextprotID!=""]))]) + if (all(!NextprotID %in% nextprot[,1])){ + write.table("None of the input ids can be found in Nextprot",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE) + } else { + res <- get_nextprot_info(nextprot,NextprotID,pc_features,localization,diseases_info) + res = res[!duplicated(res$NextprotID),] + output_content = merge(file, res,by.x=ncol,by.y="NextprotID",incomparables = NA,all.x=T) + output_content = order_columns(output_content,ncol,id_type,file) + output_content <- as.data.frame(apply(output_content, c(1,2), function(x) gsub("^$|^ $", NA, x))) #convert "" et " " to NA + write.table(output_content, output, row.names = FALSE, sep = "\t", quote = FALSE) + } + +} +protein_features() diff -r 7746af0f8209 -r 759850de6ed2 add_protein_features.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/add_protein_features.xml Thu Dec 13 03:57:57 2018 -0500 @@ -0,0 +1,149 @@ + +[neXtProt] + + + R + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + +
+ +
+ + + + + inputtype=="file" + + + + + + + + + + + + + + + +
+ + + +
+ + +
+
+ + + + + +
diff -r 7746af0f8209 -r 759850de6ed2 prot_features.xml --- a/prot_features.xml Fri Dec 07 05:09:57 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,149 +0,0 @@ - -(neXtProt) - - - R - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - -
- -
- - - - - inputtype=="file" - - - - - - - - - - - - - - - -
- - - -
- - -
-
- - - - - -
diff -r 7746af0f8209 -r 759850de6ed2 protein_features.R --- a/protein_features.R Fri Dec 07 05:09:57 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,202 +0,0 @@ -# Read file and return file content as data.frame -read_file <- function(path,header){ - file <- try(read.table(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="", check.names = F),silent=TRUE) - if (inherits(file,"try-error")){ - stop("File not found !") - }else{ - file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE] - return(file) - } -} - -order_columns <- function (df,ncol,id_type,file){ - if (id_type=="Uniprot_AC"){ncol=ncol(file)} - if (ncol==1){ #already at the right position - return (df) - } else { - df = df[,c(2:ncol,1,(ncol+1):dim.data.frame(df)[2])] - } - return (df) -} - -get_list_from_cp <-function(list){ - list = strsplit(list, "[ \t\n]+")[[1]] - list = gsub("NA","",list) - list = list[list != ""] #remove empty entry - list = gsub("-.+", "", list) #Remove isoform accession number (e.g. "-2") - return(list) -} - -get_args <- function(){ - - ## Collect arguments - args <- commandArgs(TRUE) - - ## Default setting when no arguments passed - if(length(args) < 1) { - args <- c("--help") - } - - ## Help section - if("--help" %in% args) { - cat("Selection and Annotation HPA - Arguments: - --inputtype: type of input (list of id or filename) - --input: input - --nextprot: path to nextprot information file - --column: the column number which you would like to apply... - --header: true/false if your file contains a header - --type: the type of input IDs (Uniprot_AC/EntrezID) - --pc_features: IsoPoint,SeqLength,MW - --localization: Chr,SubcellLocations - --diseases_info: Diseases - --output: text output filename \n") - - q(save="no") - } - - parseArgs <- function(x) strsplit(sub("^--", "", x), "=") - argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) - args <- as.list(as.character(argsDF$V2)) - names(args) <- argsDF$V1 - - return(args) -} - -str2bool <- function(x){ - if (any(is.element(c("t","true"),tolower(x)))){ - return (TRUE) - }else if (any(is.element(c("f","false"),tolower(x)))){ - return (FALSE) - }else{ - return(NULL) - } -} - -#take data frame, return data frame -split_ids_per_line <- function(line,ncol){ - - #print (line) - header = colnames(line) - line[ncol] = gsub("[[:blank:]]|\u00A0","",line[ncol]) - - if (length(unlist(strsplit(as.character(line[ncol]),";")))>1) { - if (length(line)==1 ) { - lines = as.data.frame(unlist(strsplit(as.character(line[ncol]),";")),stringsAsFactors = F) - } else { - if (ncol==1) { #first column - lines = suppressWarnings(cbind(unlist(strsplit(as.character(line[ncol]),";")), line[2:length(line)])) - } else if (ncol==length(line)) { #last column - lines = suppressWarnings(cbind(line[1:ncol-1],unlist(strsplit(as.character(line[ncol]),";")))) - } else { - lines = suppressWarnings(cbind(line[1:ncol-1], unlist(strsplit(as.character(line[ncol]),";"),use.names = F), line[(ncol+1):length(line)])) - } - } - colnames(lines)=header - return(lines) - } else { - return(line) - } -} - -#create new lines if there's more than one id per cell in the columns in order to have only one id per line -one_id_one_line <-function(tab,ncol){ - - if (ncol(tab)>1){ - - tab[,ncol] = sapply(tab[,ncol],function(x) gsub("[[:blank:]]","",x)) - header=colnames(tab) - res=as.data.frame(matrix(ncol=ncol(tab),nrow=0)) - for (i in 1:nrow(tab) ) { - lines = split_ids_per_line(tab[i,],ncol) - res = rbind(res,lines) - } - }else { - res = unlist(sapply(tab[,1],function(x) strsplit(x,";")),use.names = F) - res = data.frame(res[which(!is.na(res[res!=""]))],stringsAsFactors = F) - colnames(res)=colnames(tab) - } - return(res) -} - -# Get information from neXtProt -get_nextprot_info <- function(nextprot,input,pc_features,localization,diseases_info){ - if(diseases_info){ - cols = c("NextprotID",pc_features,localization,"Diseases") - } else { - cols = c("NextprotID",pc_features,localization) - } - - cols=cols[cols!="None"] - info = nextprot[match(input,nextprot$NextprotID),cols] - return(info) -} - -protein_features = function() { - - args <- get_args() - - #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/add_human_protein_features/args.rda") - #load("/home/dchristiany/proteore_project/ProteoRE/tools/add_human_protein_features/args.rda") - - #setting variables - inputtype = args$inputtype - if (inputtype == "copy_paste") { - input = get_list_from_cp(args$input) - file = data.frame(input,stringsAsFactors = F) - ncol=1 - } else if (inputtype == "file") { - filename = args$input - ncol = args$column - # Check ncol - if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) { - stop("Please enter an integer for level") - } else { - ncol = as.numeric(gsub("c", "", ncol)) - } - - header = str2bool(args$header) - file = read_file(filename, header) # Get file content - if (any(grep(";",file[,ncol]))) {file = one_id_one_line(file,ncol)} - if (args$type == "NextprotID" && ! "NextprotID" %in% colnames(file)) { colnames(file)[ncol] <- "NextprotID" - } else if (args$type == "NextprotID" && "NextprotID" %in% colnames(file) && match("NextprotID",colnames(file))!=ncol ) { - colnames(file)[match("NextprotID",colnames(file))] <- "old_NextprotID" - colnames(file)[ncol] = "NextprotID" - } - } - - # Read reference file - nextprot = read_file(args$nextprot,T) - - # Parse arguments - id_type = args$type - pc_features = strsplit(args$pc_features, ",")[[1]] - localization = strsplit(args$localization, ",")[[1]] - diseases_info = str2bool(args$diseases_info) - output = args$output - - # Change the sample ids if they are Uniprot_AC ids to be able to match them with - # Nextprot data - if (id_type=="Uniprot_AC"){ - NextprotID = gsub("^NX_$","",gsub("^","NX_",file[,ncol])) - file = cbind(file,NextprotID) - if (inputtype=="copy_paste") {colnames(file)[1]="Uniprot-AC"} - ncol=ncol(file) - } - NextprotID = file[,ncol] - - #Select user input protein ids in nextprot - #NextprotID = unique(NextprotID[which(!is.na(NextprotID[NextprotID!=""]))]) - if (all(!NextprotID %in% nextprot[,1])){ - write.table("None of the input ids can be found in Nextprot",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE) - } else { - res <- get_nextprot_info(nextprot,NextprotID,pc_features,localization,diseases_info) - res = res[!duplicated(res$NextprotID),] - output_content = merge(file, res,by.x=ncol,by.y="NextprotID",incomparables = NA,all.x=T) - output_content = order_columns(output_content,ncol,id_type,file) - output_content <- as.data.frame(apply(output_content, c(1,2), function(x) gsub("^$|^ $", NA, x))) #convert "" et " " to NA - write.table(output_content, output, row.names = FALSE, sep = "\t", quote = FALSE) - } - -} -protein_features()