comparison protein_features.R @ 1:2fc914ab92f5 draft

planemo upload commit f4517998e11df15ce84f98d2b209e5c43b572c8d-dirty
author proteore
date Tue, 04 Dec 2018 05:48:58 -0500
parents b455ec3f4f33
children 9525c7c2d5d6
comparison
equal deleted inserted replaced
0:b455ec3f4f33 1:2fc914ab92f5
8 return(file) 8 return(file)
9 } 9 }
10 } 10 }
11 11
12 order_columns <- function (df,ncol,id_type,file){ 12 order_columns <- function (df,ncol,id_type,file){
13 if (id_type=="Uniprot_AC"){ncol=dim.data.frame(file)[2]} 13 if (id_type=="Uniprot_AC"){ncol=ncol(file)}
14 if (ncol==1){ #already at the right position 14 if (ncol==1){ #already at the right position
15 return (df) 15 return (df)
16 } else { 16 } else {
17 df = df[,c(2:ncol,1,(ncol+1):dim.data.frame(df)[2])] 17 df = df[,c(2:ncol,1,(ncol+1):dim.data.frame(df)[2])]
18 } 18 }
70 }else{ 70 }else{
71 return(NULL) 71 return(NULL)
72 } 72 }
73 } 73 }
74 74
# Take a one-row data frame, return a data frame with one row per id.
#
# line: a one-row data frame. A bare vector (what `df[i, ]` yields for a
#       single-column data frame) is promoted to a one-row data frame first.
# ncol: index of the column whose cell may hold several ";"-separated ids.
#
# Blanks and non-breaking spaces are removed from the id cell. If the cell
# then contains more than one id, the row is repeated once per id and the id
# column receives one id per row; all other columns are copied unchanged.
# Otherwise the (cleaned) row is returned as is. Column names are preserved.
split_ids_per_line <- function(line, ncol) {

  # Keep the "data frame in, data frame out" contract even for a dropped row.
  if (!is.data.frame(line)) {
    line <- as.data.frame(t(line), stringsAsFactors = FALSE)
  }

  # Read the cell with [[ ]] so that as.character() sees the value itself:
  # on a one-column data frame it would return a factor's integer code and
  # would turn NA into the string "NA".
  cell <- gsub("[[:blank:]]|\u00A0", "", as.character(line[[ncol]]))
  line[[ncol]] <- cell

  ids <- unlist(strsplit(cell, ";"), use.names = FALSE)
  if (length(ids) <= 1) {
    return(line)
  }

  # Repeat the row once per id, then overwrite the id column. This covers
  # the first / middle / last / only column cases uniformly.
  lines <- line[rep(1L, length(ids)), , drop = FALSE]
  lines[[ncol]] <- ids
  rownames(lines) <- NULL
  lines
}
100
# Create new lines if there's more than one id per cell in the id column,
# in order to have only one id per line.
#
# tab:  table (data frame) holding the ids.
# ncol: index of the column whose cells may hold several ";"-separated ids.
#
# Returns a data frame with the same columns as `tab` in which every row
# produced by split_ids_per_line() for each input row is stacked in input
# order. Row names are reset to the default sequence.
one_id_one_line <- function(tab, ncol) {

  # Nothing to expand; also avoids iterating over 1:0 on an empty table.
  if (nrow(tab) == 0) {
    return(tab)
  }

  tab[, ncol] <- gsub("[[:blank:]]", "", tab[, ncol])

  rows <- lapply(seq_len(nrow(tab)), function(i) {
    # drop = FALSE keeps a one-row data frame even when tab has one column
    split_ids_per_line(tab[i, , drop = FALSE], ncol)
  })

  # Bind once instead of growing the result inside the loop.
  res <- do.call(rbind, rows)
  rownames(res) <- NULL
  res
}
113
75 # Get information from neXtProt 114 # Get information from neXtProt
76 get_nextprot_info <- function(nextprot,input,pc_features,localization,diseases_info){ 115 get_nextprot_info <- function(nextprot,input,pc_features,localization,diseases_info){
77 if(diseases_info){ 116 if(diseases_info){
78 cols = c("NextprotID",pc_features,localization,"Diseases") 117 cols = c("NextprotID",pc_features,localization,"Diseases")
79 } else { 118 } else {
94 133
95 #setting variables 134 #setting variables
96 inputtype = args$inputtype 135 inputtype = args$inputtype
97 if (inputtype == "copy_paste") { 136 if (inputtype == "copy_paste") {
98 input = get_list_from_cp(args$input) 137 input = get_list_from_cp(args$input)
99 input = input[input!=""] 138 input = input[which(!is.na(input[input!=""]))]
139 ncol=1
100 } else if (inputtype == "file") { 140 } else if (inputtype == "file") {
101 filename = args$input 141 filename = args$input
102 ncol = args$column 142 ncol = args$column
103 # Check ncol 143 # Check ncol
104 if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) { 144 if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) {
106 } else { 146 } else {
107 ncol = as.numeric(gsub("c", "", ncol)) 147 ncol = as.numeric(gsub("c", "", ncol))
108 } 148 }
109 149
110 header = str2bool(args$header) 150 header = str2bool(args$header)
111 file = read_file(filename, header) # Get file content 151 file = read_file(filename, header) # Get file content
152 if (any(grep(";",file[,ncol]))) {file = one_id_one_line(file,ncol)}
112 input = sapply(file[,ncol],function(x) strsplit(as.character(x),";")[[1]][1],USE.NAMES = F) # Extract Protein IDs list 153 input = sapply(file[,ncol],function(x) strsplit(as.character(x),";")[[1]][1],USE.NAMES = F) # Extract Protein IDs list
113 if (args$type == "NextprotID" && ! "NextprotID" %in% colnames(file)) { colnames(file)[ncol] <- "NextprotID" 154 if (args$type == "NextprotID" && ! "NextprotID" %in% colnames(file)) { colnames(file)[ncol] <- "NextprotID"
114 } else if (args$type == "NextprotID" && "NextprotID" %in% colnames(file) && match("NextprotID",colnames(file))!=ncol ) { 155 } else if (args$type == "NextprotID" && "NextprotID" %in% colnames(file) && match("NextprotID",colnames(file))!=ncol ) {
115 colnames(file)[match("NextprotID",colnames(file))] <- "old_NextprotID" 156 colnames(file)[match("NextprotID",colnames(file))] <- "old_NextprotID"
116 colnames(file)[ncol] = "NextprotID" 157 colnames(file)[ncol] = "NextprotID"
129 170
130 # Change the sample ids if they are Uniprot_AC ids to be able to match them with 171 # Change the sample ids if they are Uniprot_AC ids to be able to match them with
131 # Nextprot data 172 # Nextprot data
132 if (id_type=="Uniprot_AC"){ 173 if (id_type=="Uniprot_AC"){
133 NextprotID = gsub("^","NX_",input) 174 NextprotID = gsub("^","NX_",input)
134 if (inputtype == "file" && "NextprotID" %in% colnames(file)){colnames(file)[match("NextprotID",colnames(file))] <- "old_NextprotID"} 175 if (inputtype == "file" && "NextprotID" %in% colnames(file)){
135 file = cbind(file,NextprotID) 176 colnames(file)[match("NextprotID",colnames(file))] <- "old_NextprotID"
177 file = cbind(file,NextprotID)
178 } else {
179 file = data.frame(cbind(input,NextprotID))
180 colnames(file)[1]="Uniprot-AC"
181 }
136 } else if (id_type=="NextprotID") { 182 } else if (id_type=="NextprotID") {
137 if (inputtype == "file") { 183 if (inputtype == "file") {
138 NextprotID = file$NextprotID 184 NextprotID = file$NextprotID
139 } else { 185 } else {
140 NextprotID = input 186 NextprotID = input
141 } 187 file=data.frame(NextprotID)
142 } 188 }
143 189 }
144 # Select user input protein ids in nextprot 190
191 #Select user input protein ids in nextprot
192 NextprotID = unique(NextprotID[which(!is.na(NextprotID[NextprotID!=""]))])
145 if ((length(NextprotID[NextprotID %in% nextprot[,1]]))==0){ 193 if ((length(NextprotID[NextprotID %in% nextprot[,1]]))==0){
146 write.table("None of the input ids can be found in Nextprot",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE) 194 write.table("None of the input ids can be found in Nextprot",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE)
147 } else { 195 } else {
148 res <- get_nextprot_info(nextprot,NextprotID,pc_features,localization,diseases_info) 196 res <- get_nextprot_info(nextprot,NextprotID,pc_features,localization,diseases_info)
149 197 res = res[!duplicated(res$NextprotID),]
150 # Write output 198 output_content = merge(file, res,by="NextprotID",incomparables = NA,all.x=T)
151 if (inputtype == "copy_paste") { 199 output_content = order_columns(output_content,ncol,id_type,file)
152 if (id_type=="Uniprot_AC"){
153 output_content = cbind(input, res)
154 colnames(output_content)[1] = id_type
155 }
156 if ("res" %in% colnames(output_content)){colnames(output_content)[which(colnames(output_content)=="res")] = "NexprotID" } #if no features are selected
157 } else if (inputtype == "file") {
158 res = res[!duplicated(res$NextprotID),]
159 output_content = merge(file, res,by="NextprotID",incomparables = NA,all.x=T)
160 output_content = order_columns(output_content,ncol,id_type,file)
161 }
162 output_content <- as.data.frame(apply(output_content, c(1,2), function(x) gsub("^$|^ $", NA, x))) #convert "" et " " to NA 200 output_content <- as.data.frame(apply(output_content, c(1,2), function(x) gsub("^$|^ $", NA, x))) #convert "" et " " to NA
163 write.table(output_content, output, row.names = FALSE, sep = "\t", quote = FALSE) 201 write.table(output_content, output, row.names = FALSE, sep = "\t", quote = FALSE)
164 } 202 }
165 203
166 } 204 }