Mercurial > repos > proteore > proteore_prot_features
changeset 1:2fc914ab92f5 draft
planemo upload commit f4517998e11df15ce84f98d2b209e5c43b572c8d-dirty
| author | proteore |
|---|---|
| date | Tue, 04 Dec 2018 05:48:58 -0500 |
| parents | b455ec3f4f33 |
| children | 9525c7c2d5d6 |
| files | prot_features.xml protein_features.R |
| diffstat | 2 files changed, 59 insertions(+), 21 deletions(-) [+] |
line wrap: on
line diff
--- a/prot_features.xml Mon Nov 12 11:10:16 2018 -0500 +++ b/prot_features.xml Tue Dec 04 05:48:58 2018 -0500 @@ -1,4 +1,4 @@ -<tool id="prot_features" name="Add human protein features" version="2018.11.12"> +<tool id="prot_features" name="Add human protein features" version="2018.12.04"> <description>(neXtProt) </description> <requirements> @@ -134,7 +134,7 @@ **Authors** -Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR +David Christiany, Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform
--- a/protein_features.R Mon Nov 12 11:10:16 2018 -0500 +++ b/protein_features.R Tue Dec 04 05:48:58 2018 -0500 @@ -10,7 +10,7 @@ } order_columns <- function (df,ncol,id_type,file){ - if (id_type=="Uniprot_AC"){ncol=dim.data.frame(file)[2]} + if (id_type=="Uniprot_AC"){ncol=ncol(file)} if (ncol==1){ #already at the right position return (df) } else { @@ -72,6 +72,45 @@ } } +#take data frame, return data frame +split_ids_per_line <- function(line,ncol){ + + #print (line) + header = colnames(line) + line[ncol] = gsub("[[:blank:]]|\u00A0","",line[ncol]) + + if (length(unlist(strsplit(as.character(line[ncol]),";")))>1) { + if (length(line)==1 ) { + lines = as.data.frame(unlist(strsplit(as.character(line[ncol]),";")),stringsAsFactors = F) + } else { + if (ncol==1) { #first column + lines = suppressWarnings(cbind(unlist(strsplit(as.character(line[ncol]),";")), line[2:length(line)])) + } else if (ncol==length(line)) { #last column + lines = suppressWarnings(cbind(line[1:ncol-1],unlist(strsplit(as.character(line[ncol]),";")))) + } else { + lines = suppressWarnings(cbind(line[1:ncol-1], unlist(strsplit(as.character(line[ncol]),";"),use.names = F), line[(ncol+1):length(line)])) + } + } + colnames(lines)=header + return(lines) + } else { + return(line) + } +} + +#create new lines if there's more than one id per cell in the columns in order to have only one id per line +one_id_one_line <-function(tab,ncol){ + + tab[,ncol] = sapply(tab[,ncol],function(x) gsub("[[:blank:]]","",x)) + header=colnames(tab) + res=as.data.frame(matrix(ncol=ncol(tab),nrow=0)) + for (i in 1:nrow(tab) ) { + lines = split_ids_per_line(tab[i,],ncol) + res = rbind(res,lines) + } + return(res) +} + # Get information from neXtProt get_nextprot_info <- function(nextprot,input,pc_features,localization,diseases_info){ if(diseases_info){ @@ -96,7 +135,8 @@ inputtype = args$inputtype if (inputtype == "copy_paste") { input = get_list_from_cp(args$input) - input = input[input!=""] + input = input[which(!is.na(input[input!=""]))] + ncol=1 } else if (inputtype == "file") { filename = args$input ncol = args$column @@ -108,7 +148,8 @@ } header = str2bool(args$header) - file = read_file(filename, header) # Get file content + file = read_file(filename, header) # Get file content + if (any(grep(";",file[,ncol]))) {file = one_id_one_line(file,ncol)} input = sapply(file[,ncol],function(x) strsplit(as.character(x),";")[[1]][1],USE.NAMES = F) # Extract Protein IDs list if (args$type == "NextprotID" && ! "NextprotID" %in% colnames(file)) { colnames(file)[ncol] <- "NextprotID" } else if (args$type == "NextprotID" && "NextprotID" %in% colnames(file) && match("NextprotID",colnames(file))!=ncol ) { @@ -131,34 +172,31 @@ # Nextprot data if (id_type=="Uniprot_AC"){ NextprotID = gsub("^","NX_",input) - if (inputtype == "file" && "NextprotID" %in% colnames(file)){colnames(file)[match("NextprotID",colnames(file))] <- "old_NextprotID"} - file = cbind(file,NextprotID) + if (inputtype == "file" && "NextprotID" %in% colnames(file)){ + colnames(file)[match("NextprotID",colnames(file))] <- "old_NextprotID" + file = cbind(file,NextprotID) + } else { + file = data.frame(cbind(input,NextprotID)) + colnames(file)[1]="Uniprot-AC" + } } else if (id_type=="NextprotID") { if (inputtype == "file") { NextprotID = file$NextprotID } else { NextprotID = input + file=data.frame(NextprotID) } } - # Select user input protein ids in nextprot + #Select user input protein ids in nextprot + NextprotID = unique(NextprotID[which(!is.na(NextprotID[NextprotID!=""]))]) if ((length(NextprotID[NextprotID %in% nextprot[,1]]))==0){ write.table("None of the input ids can be found in Nextprot",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE) } else { res <- get_nextprot_info(nextprot,NextprotID,pc_features,localization,diseases_info) - - # Write output - if (inputtype == "copy_paste") { - if (id_type=="Uniprot_AC"){ - output_content = cbind(input, res) - colnames(output_content)[1] = id_type - } - if ("res" %in% colnames(output_content)){colnames(output_content)[which(colnames(output_content)=="res")] = "NexprotID" } #if no features are selected - } else if (inputtype == "file") { - res = res[!duplicated(res$NextprotID),] - output_content = merge(file, res,by="NextprotID",incomparables = NA,all.x=T) - output_content = order_columns(output_content,ncol,id_type,file) - } + res = res[!duplicated(res$NextprotID),] + output_content = merge(file, res,by="NextprotID",incomparables = NA,all.x=T) + output_content = order_columns(output_content,ncol,id_type,file) output_content <- as.data.frame(apply(output_content, c(1,2), function(x) gsub("^$|^ $", NA, x))) #convert "" et " " to NA write.table(output_content, output, row.names = FALSE, sep = "\t", quote = FALSE) }
