Mercurial > repos > devteam > lda_analysis
annotate lda_analy.xml @ 2:cf85ea165ce0 draft default tip
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
| author | devteam |
|---|---|
| date | Tue, 13 Oct 2015 12:23:54 -0400 |
| parents | 423bcc3a3785 |
| children |
| rev | line source |
|---|---|
| 1 | 1 <tool id="lda_analy1" name="Perform LDA" version="1.0.1"> |
| 2 <description>Linear Discriminant Analysis</description> | |
| 3 <requirements> | |
| 4 <requirement type="package" version="2.11.0">R</requirement> | |
| 5 </requirements> | |
| 6 <command interpreter="sh">r_wrapper.sh $script_file</command> | |
| 7 <inputs> | |
| 8 <param format="tabular" name="input" type="data" label="Source file"/> | |
|
2
cf85ea165ce0
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
1
diff
changeset
|
9 <param name="cond" type="integer" value="3" label="Number of principal components" help="See TIP below"> |
| 1 | 10 <validator type="empty_field" message="Enter a valid number of principal components, see syntax below for examples"/> |
| 11 </param> | |
| 12 | |
| 13 </inputs> | |
| 14 <outputs> | |
| 15 <data format="txt" name="output" /> | |
| 16 </outputs> | |
| 17 | |
| 18 <tests> | |
| 19 <test> | |
| 20 <param name="input" value="matrix_generator_for_pc_and_lda_output.tabular"/> | |
| 21 <output name="output" file="lda_analy_output.txt"/> | |
| 22 <param name="cond" value="2"/> | |
| 23 | |
| 24 </test> | |
| 25 </tests> | |
| 26 | |
| 27 <configfiles> | |
| 28 <configfile name="script_file"> | |
| 29 | |
| 30 rm(list = objects() ) | |
| 31 | |
| 32 ############# FORMAT X DATA ######################### | |
| 33 format<-function(data) { | |
| 34 ind=NULL | |
| 35 for(i in 1 : ncol(data)){ | |
| 36 if (is.na(data[nrow(data),i])) { | |
| 37 ind<-c(ind,i) | |
| 38 } | |
| 39 } | |
| 40 #print(is.null(ind)) | |
| 41 if (!is.null(ind)) { | |
| 42 data<-data[,-c(ind)] | |
| 43 } | |
| 44 | |
| 45 data | |
| 46 } | |
| 47 | |
| 48 ########GET RESPONSES ############################### | |
| 49 get_resp<- function(data) { | |
| 50 resp1<-as.vector(data[,ncol(data)]) | |
| 51 resp=numeric(length(resp1)) | |
| 52 for (i in 1:length(resp1)) { | |
| 53 if (resp1[i]=="Y ") { | |
| 54 resp[i] = 0 | |
| 55 } | |
| 56 if (resp1[i]=="X ") { | |
| 57 resp[i] = 1 | |
| 58 } | |
| 59 } | |
| 60 return(resp) | |
| 61 } | |
| 62 | |
| 63 ######## CHARS TO NUMBERS ########################### | |
| 64 f_to_numbers<- function(F) { | |
| 65 ind<-NULL | |
| 66 G<-matrix(0,nrow(F), ncol(F)) | |
| 67 for (i in 1:nrow(F)) { | |
| 68 for (j in 1:ncol(F)) { | |
| 69 G[i,j]<-as.integer(F[i,j]) | |
| 70 } | |
| 71 } | |
| 72 return(G) | |
| 73 } | |
| 74 | |
| 75 ###################NORMALIZING######################### | |
| 76 norm <- function(M, a=NULL, b=NULL) { | |
| 77 C<-NULL | |
| 78 ind<-NULL | |
| 79 | |
| 80 for (i in 1: ncol(M)) { | |
| 81 if (sd(M[,i])!=0) { | |
| 82 M[,i]<-(M[,i]-mean(M[,i]))/sd(M[,i]) | |
| 83 } | |
| 84 # else {print(mean(M[,i]))} | |
| 85 } | |
| 86 return(M) | |
| 87 } | |
| 88 | |
| 89 ##### LDA DIRECTIONS ################################# | |
| 90 lda_dec <- function(data, k){ | |
| 91 priors=numeric(k) | |
| 92 grandmean<-numeric(ncol(data)-1) | |
| 93 means=matrix(0,k,ncol(data)-1) | |
| 94 B = matrix(0, ncol(data)-1, ncol(data)-1) | |
| 95 N=nrow(data) | |
| 96 for (i in 1:k){ | |
| 97 priors[i]=sum(data[,1]==i)/N | |
| 98 grp=subset(data,data\$group==i) | |
| 99 means[i,]=mean(grp[,2:ncol(data)]) | |
| 100 #print(means[i,]) | |
| 101 #print(priors[i]) | |
| 102 #print(priors[i]*means[i,]) | |
| 103 grandmean = priors[i]*means[i,] + grandmean | |
| 104 } | |
| 105 | |
| 106 for (i in 1:k) { | |
| 107 B= B + priors[i]*((means[i,]-grandmean)%*%t(means[i,]-grandmean)) | |
| 108 } | |
| 109 | |
| 110 W = var(data[,2:ncol(data)]) | |
| 111 svdW = svd(W) | |
| 112 inv_sqrtW =solve(svdW\$v %*% diag(sqrt(svdW\$d)) %*% t(svdW\$v)) | |
| 113 B_star= t(inv_sqrtW)%*%B%*%inv_sqrtW | |
| 114 B_star_decomp = svd(B_star) | |
| 115 directions = inv_sqrtW%*%B_star_decomp\$v | |
| 116 return( list(directions, B_star_decomp\$d) ) | |
| 117 } | |
| 118 | |
| 119 ################ NAIVE BAYES FOR 1D SIR OR LDA ############## | |
| 120 naive_bayes_classifier <- function(resp, tr_data, test_data, k=2, tau) { | |
| 121 tr_data=data.frame(resp=resp, dir=tr_data) | |
| 122 means=numeric(k) | |
| 123 #print(k) | |
| 124 cl=numeric(k) | |
| 125 predclass=numeric(length(test_data)) | |
| 126 for (i in 1:k) { | |
| 127 grp = subset(tr_data, resp==i) | |
| 128 means[i] = mean(grp\$dir) | |
| 129 #print(i, means[i]) | |
| 130 } | |
| 131 cutoff = tau*means[1]+(1-tau)*means[2] | |
| 132 #print(tau) | |
| 133 #print(means) | |
| 134 #print(cutoff) | |
| 135 if (cutoff>means[1]) { | |
| 136 cl[1]=1 | |
| 137 cl[2]=2 | |
| 138 } | |
| 139 else { | |
| 140 cl[1]=2 | |
| 141 cl[2]=1 | |
| 142 } | |
| 143 | |
| 144 for (i in 1:length(test_data)) { | |
| 145 | |
| 146 if (test_data[i] <= cutoff) { | |
| 147 predclass[i] = cl[1] | |
| 148 } | |
| 149 else { | |
| 150 predclass[i] = cl[2] | |
| 151 } | |
| 152 } | |
| 153 #print(means) | |
| 154 #print(mean(means)) | |
| 155 #X11() | |
| 156 #plot(test_data,pch=predclass, col=resp) | |
| 157 predclass | |
| 158 } | |
| 159 | |
| 160 ################# EXTENDED ERROR RATES ################# | |
| 161 ext_error_rate <- function(predclass, actualclass,msg=c("you forgot the message"), pr=1) { | |
| 162 er=sum(predclass != actualclass)/length(predclass) | |
| 163 | |
| 164 matr<-data.frame(predclass=predclass,actualclass=actualclass) | |
| 165 escapes = subset(matr, actualclass==1) | |
| 166 subjects = subset(matr, actualclass==2) | |
| 167 er_esc=sum(escapes\$predclass != escapes\$actualclass)/length(escapes\$predclass) | |
| 168 er_subj=sum(subjects\$predclass != subjects\$actualclass)/length(subjects\$predclass) | |
| 169 | |
| 170 if (pr==1) { | |
| 171 # print(paste(c(msg, 'overall : ', (1-er)*100, "%."),collapse=" ")) | |
| 172 # print(paste(c(msg, 'within escapes : ', (1-er_esc)*100, "%."),collapse=" ")) | |
| 173 # print(paste(c(msg, 'within subjects: ', (1-er_subj)*100, "%."),collapse=" ")) | |
| 174 } | |
| 175 return(c((1-er)*100, (1-er_esc)*100, (1-er_subj)*100)) | |
| 176 } | |
| 177 | |
| 178 ## Main Function ## | |
| 179 | |
| 180 files<-matrix("${input}", 1,1, byrow=T) | |
| 181 | |
| 182 d<-"${cond}" # Number of PC | |
| 183 | |
| 184 tau<-seq(0,1, by=0.005) | |
| 185 #tau<-seq(0,1, by=0.1) | |
| 186 for_curve=matrix(-10, 3,length(tau)) | |
| 187 | |
| 188 ############################################################## | |
| 189 | |
| 190 test_data_whole_X <-read.delim(files[1,1], row.names=1) | |
| 191 | |
| 192 #### FORMAT TRAINING DATA #################################### | |
| 193 # get only necessary columns | |
| 194 | |
| 195 test_data_whole_X<-format(test_data_whole_X) | |
| 196 oligo_labels<-test_data_whole_X[1:(nrow(test_data_whole_X)-1),ncol(test_data_whole_X)] | |
| 197 test_data_whole_X<-test_data_whole_X[,1:(ncol(test_data_whole_X)-1)] | |
| 198 | |
| 199 X_names<-colnames(test_data_whole_X)[1:ncol(test_data_whole_X)] | |
| 200 test_data_whole_X<-t(test_data_whole_X) | |
| 201 resp<-get_resp(test_data_whole_X) | |
| 202 ldaqda_resp = resp + 1 | |
| 203 a<-sum(resp) # Number of Subject | |
| 204 b<-length(resp) - a # Number of Escape | |
| 205 ## FREQUENCIES ################################################# | |
| 206 F<-test_data_whole_X[,1:(ncol(test_data_whole_X)-1)] | |
| 207 F<-f_to_numbers(F) | |
| 208 FN<-norm(F, a, b) | |
| 209 ss<-svd(FN) | |
| 210 eigvar<-NULL | |
| 211 eig<-ss\$d^2 | |
| 212 | |
| 213 for ( i in 1:length(ss\$d)) { | |
| 214 eigvar[i]<-sum(eig[1:i])/sum(eig) | |
| 215 } | |
| 216 | |
| 217 #print(paste(c("Variance explained : ", eigvar[d]*100, "%"), collapse="")) | |
| 218 | |
| 219 Z<-F%*%ss\$v | |
| 220 | |
| 221 ldaqda_data <- data.frame(group=ldaqda_resp,Z[,1:d]) | |
| 222 lda_dir<-lda_dec(ldaqda_data,2) | |
| 223 train_lda_pred <-Z[,1:d]%*%lda_dir[[1]] | |
| 224 | |
| 225 ############# NAIVE BAYES CROSS-VALIDATION ############# | |
| 226 ### LDA ##### | |
| 227 | |
| 228 y<-ldaqda_resp | |
| 229 X<-F | |
| 230 cv<-matrix(c(rep('NA',nrow(test_data_whole_X))), nrow(test_data_whole_X), length(tau)) | |
| 231 for (i in 1:nrow(test_data_whole_X)) { | |
| 232 # print(i) | |
| 233 resp<-y[-i] | |
| 234 p<-matrix(X[-i,], dim(X)[1]-1, dim(X)[2]) | |
| 235 testdata<-matrix(X[i,],1,dim(X)[2]) | |
| 236 p1<-norm(p) | |
| 237 sss<-svd(p1) | |
| 238 pred<-(p%*%sss\$v)[,1:d] | |
| 239 test<- (testdata%*%sss\$v)[,1:d] | |
| 240 lda <- lda_dec(data.frame(group=resp,pred),2) | |
| 241 pred <- pred[,1:d]%*%lda[[1]][,1] | |
| 242 test <- test%*%lda[[1]][,1] | |
| 243 test<-matrix(test, 1, length(test)) | |
| 244 for (t in 1:length(tau)) { | |
| 245 cv[i, t] <- naive_bayes_classifier (resp, pred, test,k=2, tau[t]) | |
| 246 } | |
| 247 } | |
| 248 | |
| 249 for (t in 1:length(tau)) { | |
| 250 tr_err<-ext_error_rate(cv[,t], ldaqda_resp , c("CV"), 1) | |
| 251 for_curve[1:3,t]<-tr_err | |
| 252 } | |
| 253 | |
| 254 dput(for_curve, file="${output}") | |
| 255 | |
| 256 | |
| 257 </configfile> | |
| 258 </configfiles> | |
| 259 | |
| 260 <help> | |
| 261 | |
| 262 .. class:: infomark | |
| 263 | |
| 264 **TIP:** If you want to perform Principal Component Analysis (PCA) on the give numeric input data (which corresponds to the "Source file First in "Generate A Matrix" tool), please use *Multivariate Analysis/Principal Component Analysis* | |
| 265 | |
| 266 ----- | |
| 267 | |
| 268 .. class:: infomark | |
| 269 | |
| 270 **What it does** | |
| 271 | |
| 272 This tool consists of the module to perform the Linear Discriminant Analysis as described in Carrel et al., 2006 (PMID: 17009873) | |
| 273 | |
| 274 *Carrel L, Park C, Tyekucheva S, Dunn J, Chiaromonte F, et al. (2006) Genomic Environment Predicts Expression Patterns on the Human Inactive X Chromosome. PLoS Genet 2(9): e151. doi:10.1371/journal.pgen.0020151* | |
| 275 | |
| 276 ----- | |
| 277 | |
| 278 .. class:: warningmark | |
| 279 | |
| 280 **Note** | |
| 281 | |
| 282 - Output from "Generate A Matrix" tool is used as input file for this tool | |
| 283 - Output of this tool contains LDA classification success rates for different values of the turning parameter tau (from 0 to 1 with 0.005 interval). This output file will be used to establish the ROC plot, and you can obtain more detail information from this plot. | |
| 284 | |
| 285 | |
| 286 </help> | |
| 287 | |
| 288 </tool> |
