Mercurial > repos > proteore > proteore_prot_features
comparison protein_features.R @ 1:2fc914ab92f5 draft
planemo upload commit f4517998e11df15ce84f98d2b209e5c43b572c8d-dirty
| author | proteore |
|---|---|
| date | Tue, 04 Dec 2018 05:48:58 -0500 |
| parents | b455ec3f4f33 |
| children | 9525c7c2d5d6 |
comparison
equal
deleted
inserted
replaced
| 0:b455ec3f4f33 | 1:2fc914ab92f5 |
|---|---|
| 8 return(file) | 8 return(file) |
| 9 } | 9 } |
| 10 } | 10 } |
| 11 | 11 |
| 12 order_columns <- function (df,ncol,id_type,file){ | 12 order_columns <- function (df,ncol,id_type,file){ |
| 13 if (id_type=="Uniprot_AC"){ncol=dim.data.frame(file)[2]} | 13 if (id_type=="Uniprot_AC"){ncol=ncol(file)} |
| 14 if (ncol==1){ #already at the right position | 14 if (ncol==1){ #already at the right position |
| 15 return (df) | 15 return (df) |
| 16 } else { | 16 } else { |
| 17 df = df[,c(2:ncol,1,(ncol+1):dim.data.frame(df)[2])] | 17 df = df[,c(2:ncol,1,(ncol+1):dim.data.frame(df)[2])] |
| 18 } | 18 } |
| 70 }else{ | 70 }else{ |
| 71 return(NULL) | 71 return(NULL) |
| 72 } | 72 } |
| 73 } | 73 } |
| 74 | 74 |
| | 75 #take data frame, return data frame |
| | 76 split_ids_per_line <- function(line,ncol){ |
| | 77 |
| | 78 #print (line) |
| | 79 header = colnames(line) |
| | 80 line[ncol] = gsub("[[:blank:]]|\u00A0","",line[ncol]) |
| | 81 |
| | 82 if (length(unlist(strsplit(as.character(line[ncol]),";")))>1) { |
| | 83 if (length(line)==1 ) { |
| | 84 lines = as.data.frame(unlist(strsplit(as.character(line[ncol]),";")),stringsAsFactors = F) |
| | 85 } else { |
| | 86 if (ncol==1) { #first column |
| | 87 lines = suppressWarnings(cbind(unlist(strsplit(as.character(line[ncol]),";")), line[2:length(line)])) |
| | 88 } else if (ncol==length(line)) { #last column |
| | 89 lines = suppressWarnings(cbind(line[1:ncol-1],unlist(strsplit(as.character(line[ncol]),";")))) |
| | 90 } else { |
| | 91 lines = suppressWarnings(cbind(line[1:ncol-1], unlist(strsplit(as.character(line[ncol]),";"),use.names = F), line[(ncol+1):length(line)])) |
| | 92 } |
| | 93 } |
| | 94 colnames(lines)=header |
| | 95 return(lines) |
| | 96 } else { |
| | 97 return(line) |
| | 98 } |
| | 99 } |
| | 100 |
| | 101 #create new lines if there's more than one id per cell in the columns in order to have only one id per line |
| | 102 one_id_one_line <-function(tab,ncol){ |
| | 103 |
| | 104 tab[,ncol] = sapply(tab[,ncol],function(x) gsub("[[:blank:]]","",x)) |
| | 105 header=colnames(tab) |
| | 106 res=as.data.frame(matrix(ncol=ncol(tab),nrow=0)) |
| | 107 for (i in 1:nrow(tab) ) { |
| | 108 lines = split_ids_per_line(tab[i,],ncol) |
| | 109 res = rbind(res,lines) |
| | 110 } |
| | 111 return(res) |
| | 112 } |
| | 113 |
| 75 # Get information from neXtProt | 114 # Get information from neXtProt |
| 76 get_nextprot_info <- function(nextprot,input,pc_features,localization,diseases_info){ | 115 get_nextprot_info <- function(nextprot,input,pc_features,localization,diseases_info){ |
| 77 if(diseases_info){ | 116 if(diseases_info){ |
| 78 cols = c("NextprotID",pc_features,localization,"Diseases") | 117 cols = c("NextprotID",pc_features,localization,"Diseases") |
| 79 } else { | 118 } else { |
| 94 | 133 |
| 95 #setting variables | 134 #setting variables |
| 96 inputtype = args$inputtype | 135 inputtype = args$inputtype |
| 97 if (inputtype == "copy_paste") { | 136 if (inputtype == "copy_paste") { |
| 98 input = get_list_from_cp(args$input) | 137 input = get_list_from_cp(args$input) |
| 99 input = input[input!=""] | 138 input = input[which(!is.na(input[input!=""]))] |
| | 139 ncol=1 |
| 100 } else if (inputtype == "file") { | 140 } else if (inputtype == "file") { |
| 101 filename = args$input | 141 filename = args$input |
| 102 ncol = args$column | 142 ncol = args$column |
| 103 # Check ncol | 143 # Check ncol |
| 104 if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) { | 144 if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) { |
| 106 } else { | 146 } else { |
| 107 ncol = as.numeric(gsub("c", "", ncol)) | 147 ncol = as.numeric(gsub("c", "", ncol)) |
| 108 } | 148 } |
| 109 | 149 |
| 110 header = str2bool(args$header) | 150 header = str2bool(args$header) |
| 111 file = read_file(filename, header) # Get file content | 151 file = read_file(filename, header) # Get file content |
| | 152 if (any(grep(";",file[,ncol]))) {file = one_id_one_line(file,ncol)} |
| 112 input = sapply(file[,ncol],function(x) strsplit(as.character(x),";")[[1]][1],USE.NAMES = F) # Extract Protein IDs list | 153 input = sapply(file[,ncol],function(x) strsplit(as.character(x),";")[[1]][1],USE.NAMES = F) # Extract Protein IDs list |
| 113 if (args$type == "NextprotID" && ! "NextprotID" %in% colnames(file)) { colnames(file)[ncol] <- "NextprotID" | 154 if (args$type == "NextprotID" && ! "NextprotID" %in% colnames(file)) { colnames(file)[ncol] <- "NextprotID" |
| 114 } else if (args$type == "NextprotID" && "NextprotID" %in% colnames(file) && match("NextprotID",colnames(file))!=ncol ) { | 155 } else if (args$type == "NextprotID" && "NextprotID" %in% colnames(file) && match("NextprotID",colnames(file))!=ncol ) { |
| 115 colnames(file)[match("NextprotID",colnames(file))] <- "old_NextprotID" | 156 colnames(file)[match("NextprotID",colnames(file))] <- "old_NextprotID" |
| 116 colnames(file)[ncol] = "NextprotID" | 157 colnames(file)[ncol] = "NextprotID" |
| 129 | 170 |
| 130 # Change the sample ids if they are Uniprot_AC ids to be able to match them with | 171 # Change the sample ids if they are Uniprot_AC ids to be able to match them with |
| 131 # Nextprot data | 172 # Nextprot data |
| 132 if (id_type=="Uniprot_AC"){ | 173 if (id_type=="Uniprot_AC"){ |
| 133 NextprotID = gsub("^","NX_",input) | 174 NextprotID = gsub("^","NX_",input) |
| 134 if (inputtype == "file" && "NextprotID" %in% colnames(file)){colnames(file)[match("NextprotID",colnames(file))] <- "old_NextprotID"} | 175 if (inputtype == "file" && "NextprotID" %in% colnames(file)){ |
| 135 file = cbind(file,NextprotID) | 176 colnames(file)[match("NextprotID",colnames(file))] <- "old_NextprotID" |
| | 177 file = cbind(file,NextprotID) |
| | 178 } else { |
| | 179 file = data.frame(cbind(input,NextprotID)) |
| | 180 colnames(file)[1]="Uniprot-AC" |
| | 181 } |
| 136 } else if (id_type=="NextprotID") { | 182 } else if (id_type=="NextprotID") { |
| 137 if (inputtype == "file") { | 183 if (inputtype == "file") { |
| 138 NextprotID = file$NextprotID | 184 NextprotID = file$NextprotID |
| 139 } else { | 185 } else { |
| 140 NextprotID = input | 186 NextprotID = input |
| 141 } | 187 file=data.frame(NextprotID) |
| 142 } | 188 } |
| 143 | 189 } |
| 144 # Select user input protein ids in nextprot | 190 |
| | 191 #Select user input protein ids in nextprot |
| | 192 NextprotID = unique(NextprotID[which(!is.na(NextprotID[NextprotID!=""]))]) |
| 145 if ((length(NextprotID[NextprotID %in% nextprot[,1]]))==0){ | 193 if ((length(NextprotID[NextprotID %in% nextprot[,1]]))==0){ |
| 146 write.table("None of the input ids can be found in Nextprot",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE) | 194 write.table("None of the input ids can be found in Nextprot",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE) |
| 147 } else { | 195 } else { |
| 148 res <- get_nextprot_info(nextprot,NextprotID,pc_features,localization,diseases_info) | 196 res <- get_nextprot_info(nextprot,NextprotID,pc_features,localization,diseases_info) |
| 149 | 197 res = res[!duplicated(res$NextprotID),] |
| 150 # Write output | 198 output_content = merge(file, res,by="NextprotID",incomparables = NA,all.x=T) |
| 151 if (inputtype == "copy_paste") { | 199 output_content = order_columns(output_content,ncol,id_type,file) |
| 152 if (id_type=="Uniprot_AC"){ | |
| 153 output_content = cbind(input, res) | |
| 154 colnames(output_content)[1] = id_type | |
| 155 } | |
| 156 if ("res" %in% colnames(output_content)){colnames(output_content)[which(colnames(output_content)=="res")] = "NexprotID" } #if no features are selected | |
| 157 } else if (inputtype == "file") { | |
| 158 res = res[!duplicated(res$NextprotID),] | |
| 159 output_content = merge(file, res,by="NextprotID",incomparables = NA,all.x=T) | |
| 160 output_content = order_columns(output_content,ncol,id_type,file) | |
| 161 } | |
| 162 output_content <- as.data.frame(apply(output_content, c(1,2), function(x) gsub("^$|^ $", NA, x))) #convert "" et " " to NA | 200 output_content <- as.data.frame(apply(output_content, c(1,2), function(x) gsub("^$|^ $", NA, x))) #convert "" et " " to NA |
| 163 write.table(output_content, output, row.names = FALSE, sep = "\t", quote = FALSE) | 201 write.table(output_content, output, row.names = FALSE, sep = "\t", quote = FALSE) |
| 164 } | 202 } |
| 165 | 203 |
| 166 } | 204 } |
