Mercurial > repos > galaxyp > cardinal_classification
changeset 11:8abcfbb033a1 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/cardinal commit 888b3e991d0752b694bf480531ce0e5318c2f337-dirty"
| author | galaxyp | 
|---|---|
| date | Fri, 07 May 2021 09:46:01 +0000 | 
| parents | 4672252da0a0 | 
| children | fbcc2ab88045 | 
| files | classification.xml test-data/pixels_test6.tabular test-data/test6.pdf test-data/test6.rdata | 
| diffstat | 4 files changed, 152 insertions(+), 99 deletions(-) [+] | 
line wrap: on
 line diff
--- a/classification.xml Fri Feb 19 18:45:59 2021 +0000 +++ b/classification.xml Fri May 07 09:46:01 2021 +0000 @@ -1,4 +1,4 @@ -<tool id="cardinal_classification" name="MSI classification" version="@VERSION@.0"> +<tool id="cardinal_classification" name="MSI classification" version="@VERSION@.1"> <description>spatial classification of mass spectrometry imaging data</description> <macros> <import>macros.xml</import> @@ -25,7 +25,7 @@ library(Cardinal) library(gridExtra) library(ggplot2) - +library(scales) @READING_MSIDATA@ @@ -57,7 +57,7 @@ ##################### I) numbers and control plots ############################# -############################################################################### +################################################################################ ## table with values grid.table(property_df, rows= NULL) @@ -67,7 +67,7 @@ opar <- par() - ######################## II) Training ############################# + ######################## II) Training ####################################### ############################################################################# #if str( $type_cond.type_method) == "training": print("training") @@ -90,21 +90,26 @@ merged_response = merge(msidata_coordinates, y_input, by=c("x", "y"), all.x=TRUE) merged_response[is.na(merged_response)] = "NA" merged_response = merged_response[order(merged_response\$pixel_index),] - y_vector = as.factor(merged_response[,4]) + conditions = as.factor(merged_response[,4]) + y_vector = conditions ## plot of y vector - position_df = cbind(coord(msidata)[,1:2], y_vector) - y_plot = ggplot(position_df, aes(x=x, y=y, fill=y_vector))+ + position_df = cbind(coord(msidata)[,1:2], conditions) + y_plot = ggplot(position_df, aes(x=x, y=y, fill=conditions))+ geom_tile() + coord_fixed()+ - ggtitle("Distribution of the response variable y")+ - theme_bw()+ + ggtitle("Distribution of the conditions")+ + theme_bw()+ + theme( + plot.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank())+ theme(text=element_text(family="ArialMT", face="bold", size=15))+ theme(legend.position="bottom",legend.direction="vertical")+ guides(fill=guide_legend(ncol=4,byrow=TRUE)) - coord_labels = aggregate(cbind(x,y)~y_vector, data=position_df, mean, na.rm=TRUE, na.action="na.pass") - coord_labels\$file_number = gsub( "_.*$", "", coord_labels\$y_vector) + coord_labels = aggregate(cbind(x,y)~conditions, data=position_df, mean, na.rm=TRUE, na.action="na.pass") + coord_labels\$file_number = gsub( "_.*$", "", coord_labels\$conditions) print(y_plot) @@ -119,7 +124,11 @@ geom_tile() + coord_fixed()+ ggtitle("Distribution of the fold variable")+ - theme_bw()+ + theme_bw()+ + theme( + plot.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank())+ theme(text=element_text(family="ArialMT", face="bold", size=15))+ theme(legend.position="bottom",legend.direction="vertical")+ guides(fill=guide_legend(ncol=4,byrow=TRUE)) @@ -276,7 +285,11 @@ geom_tile() + coord_fixed()+ ggtitle("Predicted condition for each pixel")+ - theme_bw()+ + theme_bw()+ + theme( + plot.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank())+ theme(text=element_text(family="ArialMT", face="bold", size=15))+ theme(legend.position="bottom",legend.direction="vertical")+ guides(fill=guide_legend(ncol=4,byrow=TRUE)) @@ -443,7 +456,11 @@ geom_tile() + coord_fixed()+ ggtitle("Predicted condition for each pixel")+ - theme_bw()+ + theme_bw()+ + theme( + plot.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank())+ theme(text=element_text(family="ArialMT", face="bold", size=15))+ theme(legend.position="bottom",legend.direction="vertical")+ guides(fill=guide_legend(ncol=4,byrow=TRUE)) @@ -507,6 +524,11 @@ maximumy = max(coord(msidata.cv.ssc)[,2]) image(msidata.cv.ssc, model = list( r = r_value, s = s_value ), ylim= c(maximumy+0.2*maximumy,minimumy-0.2*minimumy),layout=c(1,1)) + #if $type_cond.method_cond.ssc_analysis_cond.write_best_params: + write.table(r_value, file="$best_r", quote = FALSE, row.names = FALSE, col.names=FALSE, sep = "\t") + write.table(s_value, file="$best_s", quote = FALSE, row.names = FALSE, col.names=FALSE, sep = "\t") + #end if + ## print table with summary in pdf par(opar) plot(0,type='n',axes=FALSE,ann=FALSE) @@ -542,9 +564,12 @@ number_groups = length(levels(y_vector)) ## SSC analysis and plot - msidata.ssc <- spatialShrunkenCentroids(msidata, y = y_vector, .fold = fold_vector, + msidata.ssc <- spatialShrunkenCentroids(msidata, y = y_vector, r = c($type_cond.method_cond.ssc_r), s = c($type_cond.method_cond.ssc_s), method = "$type_cond.method_cond.ssc_kernel_method") - plot(msidata.ssc, mode = "tstatistics", model = list("r" = c($type_cond.method_cond.ssc_r), "s" = c($type_cond.method_cond.ssc_s))) + plot(msidata.ssc, mode = "tstatistics", model = list("r" = c($type_cond.method_cond.ssc_r), "s" = c($type_cond.method_cond.ssc_s)), + col=hue_pal()(length(levels(msidata.ssc\$classes[[1]]))), lwd=2) + + ### summary table SSC ##############summary_table = summary(msidata.ssc) @@ -582,6 +607,7 @@ ## m/z and pixel information output ssc_classes = data.frame(msidata.ssc\$classes[[1]]) + ssc_probabilities = data.frame(msidata.ssc\$probabilities[[1]]) ## pixel names and coordinates ## to remove potential sample names and z dimension, split at comma and take only x and y @@ -596,39 +622,36 @@ rm(msidata) gc() - ssc_classes2 = data.frame(pixel_names, x_coordinates, y_coordinates, ssc_classes) - colnames(ssc_classes2) = c("pixel names", "x", "y","predicted condition") + ssc_classes2 = data.frame(pixel_names, x_coordinates, y_coordinates, ssc_classes, ssc_probabilities) + colnames(ssc_classes2) = c("pixel names", "x", "y","predicted condition", levels(msidata.ssc\$classes[[1]])) ssc_toplabels = topFeatures(msidata.ssc, n=Inf) ssc_toplabels[,6:9] <-round(ssc_toplabels[,6:9],6) write.table(ssc_toplabels, file="$mzfeatures", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t") write.table(ssc_classes2, file="$pixeloutput", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t") - - ## image with predicted classes - prediction_df = cbind(coord(msidata.ssc)[,1:2], ssc_classes) - colnames(prediction_df) = c("x", "y", "predicted_classes") + + image(msidata.ssc, model=list(r = c($type_cond.method_cond.ssc_r), s = c($type_cond.method_cond.ssc_s)), + col=hue_pal()(length(levels(msidata.ssc\$classes[[1]]))), mode="classes", layout=c(1,1), main="Class Prediction") + image(msidata.ssc, model=list(r = c($type_cond.method_cond.ssc_r), s = c($type_cond.method_cond.ssc_s)), + col=hue_pal()(length(levels(msidata.ssc\$classes[[1]]))), mode="probabilities", layout=c(1,1), main="Class probabilities") - prediction_plot = ggplot(prediction_df, aes(x=x, y=y, fill=predicted_classes))+ - geom_tile() + - coord_fixed()+ - ggtitle("Predicted condition for each pixel")+ - theme_bw()+ - theme(text=element_text(family="ArialMT", face="bold", size=15))+ - theme(legend.position="bottom",legend.direction="vertical")+ - guides(fill=guide_legend(ncol=4,byrow=TRUE)) - coord_labels = aggregate(cbind(x,y)~predicted_classes, data=prediction_df, mean, na.rm=TRUE, na.action="na.pass") - coord_labels\$file_number = gsub( "_.*$", "", coord_labels\$predicted_classes) - print(prediction_plot) - ## image with right and wrong classes: + prediction_df = cbind(coord(msidata.ssc)[,1:2], ssc_classes) + colnames(prediction_df) = c("x", "y", "predicted_classes") comparison_df = cbind(prediction_df, y_vector) comparison_df\$correct<- ifelse(comparison_df\$predicted_classes==comparison_df\$y_vector, T, F) + correctness = round(sum(comparison_df\$correct)/length(comparison_df\$correct)*100,2) correctness_plot = ggplot(comparison_df, aes(x=x, y=y, fill=correct))+ geom_tile() + coord_fixed()+ - ggtitle("Correctness of classification")+ - theme_bw()+ + ggtitle(paste0("Correctness of classification: ",correctness, "%"))+ + scale_fill_manual(values = c("TRUE" = "orange","FALSE" = "darkblue"))+ + theme_bw()+ + theme( + plot.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank())+ theme(text=element_text(family="ArialMT", face="bold", size=15))+ theme(legend.position="bottom",legend.direction="vertical")+ guides(fill=guide_legend(ncol=2,byrow=TRUE)) @@ -668,6 +691,16 @@ merged_response = merged_response[order(merged_response\$pixel_index),] new_y_vector = as.factor(merged_response[,4]) prediction = predict(training_data,msidata, newy = new_y_vector) + + ## Summary table prediction + summary_table = summary(prediction)\$accuracy[[names(prediction@resultData)]] + summary_table2 = round(as.numeric(summary_table), digits=2) + summary_matrix = matrix(summary_table2, nrow=4, ncol=ncol(summary_table)) + summary_table3 = cbind(rownames(summary_table), summary_matrix) ## include rownames in table + summary_table4 = t(summary_table3) + summary_table5 = cbind(c(names(prediction@resultData),colnames(summary_table)), summary_table4) + plot(0,type='n',axes=FALSE,ann=FALSE) + grid.table(summary_table5, rows= NULL) #else prediction = predict(training_data,msidata) @@ -684,57 +717,65 @@ predicted_toplabels = topFeatures(prediction, n=Inf) if (colnames(predicted_toplabels)[4] == "coefficients"){ predicted_toplabels[,4:6] <-round(predicted_toplabels[,4:6],5) - }else{ predicted_toplabels[,6:9] <-round(predicted_toplabels[,6:9],5)} + + ##predicted classes + prediction_df = cbind(coord(prediction)[,1:2], predicted_classes) + colnames(prediction_df) = c("x", "y", "predicted_classes") + + #if str($type_cond.classification_type) == "SSC_classifier": + ## this seems to work only for SSC, therefore overwrite tables + predicted_probabilities = data.frame(prediction\$probabilities[[1]]) + predicted_classes2 = data.frame(pixel_names, x_coordinates, y_coordinates, predicted_classes, predicted_probabilities) + colnames(predicted_classes2) = c("pixel names", "x", "y","predicted condition", levels(prediction\$classes[[1]])) + ## also image modes are specific to SSC + image(prediction, mode="classes", layout=c(1,1), main="Class", col=hue_pal()(length(unique(prediction\$classes[[1]])))) + image(prediction, mode="probabilities", layout=c(1,1), main="Class probabilities", col=hue_pal()(length(unique(prediction\$classes[[1]])))) + + #else + + prediction_plot = ggplot(prediction_df, aes(x=x, y=y, fill=predicted_classes))+ + geom_tile()+ + coord_fixed()+ + ggtitle("Predicted condition for each spectrum")+ + theme_bw()+ + theme( + plot.background = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank())+ + theme(text=element_text(family="ArialMT", face="bold", size=15))+ + theme(legend.position="bottom", legend.direction="vertical")+ + guides(fill=guide_legend(ncol=4, byrow=TRUE)) + coord_labels = aggregate(cbind(x,y)~predicted_classes, data=prediction_df, mean, na.rm=TRUE, na.action="na.pass") + coord_labels\$file_number = gsub( "_.*ยง", "", coord_labels\$predicted_classes) + print(prediction_plot) + #end if + write.table(predicted_toplabels, file="$mzfeatures", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t") write.table(predicted_classes2, file="$pixeloutput", quote = FALSE, row.names = FALSE, col.names=TRUE, sep = "\t") - ## image with predicted classes - prediction_df = cbind(coord(prediction)[,1:2], predicted_classes) - colnames(prediction_df) = c("x", "y", "predicted_classes") - prediction_plot = ggplot(prediction_df, aes(x=x, y=y, fill=predicted_classes))+ - geom_tile() + - coord_fixed()+ - ggtitle("Predicted condition for each pixel")+ - theme_bw()+ - theme(text=element_text(family="ArialMT", face="bold", size=15))+ - theme(legend.position="bottom",legend.direction="vertical")+ - guides(fill=guide_legend(ncol=4,byrow=TRUE)) - coord_labels = aggregate(cbind(x,y)~predicted_classes, data=prediction_df, mean, na.rm=TRUE, na.action="na.pass") - coord_labels\$file_number = gsub( "_.*$", "", coord_labels\$predicted_classes) - print(prediction_plot) + #if str($type_cond.new_y_values_cond.new_y_values) == "new_response": ## image with right and wrong classes: comparison_df = cbind(prediction_df, new_y_vector) - comparison_df\$correct<- as.factor(ifelse(comparison_df\$predicted_classes==comparison_df\$new_y_vector, T, F)) + comparison_df\$correct<- ifelse(comparison_df\$predicted_classes==comparison_df\$new_y_vector, T, F) + correctness = round(sum(comparison_df\$correct)/length(comparison_df\$correct)*100,2) correctness_plot = ggplot(comparison_df, aes(x=x, y=y, fill=correct))+ geom_tile()+ scale_fill_manual(values = c("TRUE" = "orange","FALSE" = "darkblue"))+ coord_fixed()+ - ggtitle("Correctness of classification")+ + ggtitle(paste0("Correctness of classification: ",correctness, "%"))+ theme_bw()+ theme(text=element_text(family="ArialMT", face="bold", size=15))+ theme(legend.position="bottom",legend.direction="vertical")+ guides(fill=guide_legend(ncol=2,byrow=TRUE)) - ## coord_labels = aggregate(cbind(x,y)~correct, data=comparison_df, mean, na.rm=TRUE, na.action="na.pass") - ##coord_labels\$file_number = gsub( "_.*$", "", coord_labels\$predicted_classes) print(correctness_plot) - - - ## Summary table prediction - summary_table = summary(prediction)\$accuracy[[names(prediction@resultData)]] - summary_table2 = round(as.numeric(summary_table), digits=2) - summary_matrix = matrix(summary_table2, nrow=4, ncol=ncol(summary_table)) - summary_table3 = cbind(rownames(summary_table), summary_matrix) ## include rownames in table - summary_table4 = t(summary_table3) - summary_table5 = cbind(c(names(prediction@resultData),colnames(summary_table)), summary_table4) - plot(0,type='n',axes=FALSE,ann=FALSE) - grid.table(summary_table5, rows= NULL) + #end if ## optional output as .RData #if $output_rdata: @@ -833,8 +874,9 @@ <option value="ssc_cvapply" selected="True">cvApply</option> <option value="ssc_analysis">spatial shrunken centroids analysis</option> </param> - <when value="ssc_cvapply"/> - + <when value="ssc_cvapply"> + <param name="write_best_params" type="boolean" label="Write out best r and s values" help="Can be used to generate automatic classification workflow"/> + </when> <when value="ssc_analysis"> <!--param name="ssc_toplabels" type="integer" value="100" label="Number of toplabels (m/z features) which should be written in tabular output"/--> @@ -851,8 +893,7 @@ <param name="ssc_kernel_method" type="select" display="radio" label = "The method to use to calculate the spatial smoothing kernels for the embedding. The 'gaussian' method refers to spatially-aware (SA) weights, and 'adaptive' refers to spatially-aware structurally-adaptive (SASA) weights"> <option value="gaussian">gaussian</option> <option value="adaptive" selected="True">adaptive</option> - </param> - + </param> </when> </conditional> @@ -862,10 +903,15 @@ <param name="training_result" type="data" format="rdata" label="Result from previous classification training"/> <!--param name="predicted_toplabels" type="integer" value="100" label="Number of toplabels (m/z features) which should be written in tabular output"/--> + <param name="classification_type" type="select" display="radio" optional="False" label="Which classification method was used"> + <option value="PLS_classifier" selected="True" >PLS classifier</option> + <option value="OPLS_classifier">OPLS classifier</option> + <option value="SSC_classifier">SSC_classifier</option> + </param> <conditional name="new_y_values_cond"> - <param name="new_y_values" type="select" label="Should new response values be used"> - <option value="no_new_response" selected="True">old response should be used</option> - <option value="new_response">load new response from tabular file</option> + <param name="new_y_values" type="select" label="Load annotations (optional, but allows accuracy calculations)"> + <option value="no_new_response" selected="True">no</option> + <option value="new_response">use annotations</option> </param> <when value="no_new_response"/> <when value="new_response"> @@ -884,6 +930,12 @@ <data format="pdf" name="classification_images" from_work_dir="classificationpdf.pdf" label = "${tool.name} on ${on_string}: results"/> <data format="tabular" name="mzfeatures" label="${tool.name} on ${on_string}: features"/> <data format="tabular" name="pixeloutput" label="${tool.name} on ${on_string}: pixels"/> + <data format="txt" name="best_r" label="${tool.name} on ${on_string}:best r"> + <filter>type_cond['type_method'] == 'training' and type_cond['method_cond']['class_method'] == 'spatialShrunkenCentroids' and type_cond['method_cond']['ssc_analysis_cond']['ssc_method'] == 'ssc_cvapply' and type_cond['method_cond']['ssc_analysis_cond']['write_best_params']</filter> + </data> + <data format="txt" name="best_s" label="${tool.name} on ${on_string}:best s"> + <filter>type_cond['type_method'] == 'training' and type_cond['method_cond']['class_method'] == 'spatialShrunkenCentroids' and type_cond['method_cond']['ssc_analysis_cond']['ssc_method'] == 'ssc_cvapply' and type_cond['method_cond']['ssc_analysis_cond']['write_best_params']</filter> + </data> <data format="rdata" name="classification_rdata" label="${tool.name} on ${on_string}: results.RData"> <filter>output_rdata</filter> </data> @@ -1101,8 +1153,9 @@ - training and prediction - training can be done with cvapply that uses cross validation to find the best value for s, this requires not only a condition for each spectrum but also a fold (each fold should contain spectra of all conditions) - - training with the best value for s gives the top m/z features for each condition and the predicted classification group for each spectrum + - training with the best value for r and s gives the top m/z features for each condition and the predicted classification group for each spectrum - training result can be saved as RData file that can be reused for prediction of further samples + - prediction can calculate accuracies when the annotations are known and provided .. image:: $PATH_TO_IMAGES/classification_overview.png
--- a/test-data/pixels_test6.tabular Fri Feb 19 18:45:59 2021 +0000 +++ b/test-data/pixels_test6.tabular Fri May 07 09:46:01 2021 +0000 @@ -1,25 +1,25 @@ -pixel names x y predicted condition -xy_1_1 1 1 A -xy_2_1 2 1 A -xy_3_1 3 1 B -xy_4_1 4 1 C -xy_1_2 1 2 C -xy_2_2 2 2 C -xy_3_2 3 2 A -xy_4_2 4 2 A -xy_1_3 1 3 A -xy_2_3 2 3 B -xy_3_3 3 3 C -xy_4_3 4 3 A -xy_10_1 10 1 C -xy_11_1 11 1 C -xy_12_1 12 1 C -xy_13_1 13 1 B -xy_10_2 10 2 C -xy_11_2 11 2 B -xy_12_2 12 2 C -xy_13_2 13 2 C -xy_10_3 10 3 C -xy_11_3 11 3 C -xy_12_3 12 3 B -xy_13_3 13 3 C +pixel names x y predicted condition A B C +xy_1_1 1 1 A 0.434439526064797 0.195646317191818 0.369914156743386 +xy_2_1 2 1 A 0.38219998209377 0.242372158141275 0.375427859764956 +xy_3_1 3 1 B 0.312531499299517 0.385612104162858 0.301856396537625 +xy_4_1 4 1 C 0.393153488582866 0.191107087820634 0.4157394235965 +xy_1_2 1 2 C 0.366986470447772 0.216121568441093 0.416891961111135 +xy_2_2 2 2 C 0.381682206547616 0.213188918797062 0.405128874655322 +xy_3_2 3 2 A 0.376695037169723 0.260689491088564 0.362615471741713 +xy_4_2 4 2 A 0.42305935188829 0.174038449100755 0.402902199010954 +xy_1_3 1 3 A 0.382420991383021 0.249364697048677 0.368214311568302 +xy_2_3 2 3 B 0.272145998315727 0.446525938567718 0.281328063116555 +xy_3_3 3 3 C 0.36296987427851 0.255631013944556 0.381399111776934 +xy_4_3 4 3 A 0.444812272103175 0.132274264153212 0.422913463743613 +xy_10_1 10 1 C 0.376216993893763 0.227584528606788 0.39619847749945 +xy_11_1 11 1 C 0.358430578177403 0.236120068794936 0.405449353027661 +xy_12_1 12 1 C 0.359751662628136 0.218620985552221 0.421627351819643 +xy_13_1 13 1 B 0.101486342705225 0.813997511218961 0.0845161460758142 +xy_10_2 10 2 C 0.354612526523361 0.272635192773437 0.372752280703202 +xy_11_2 11 2 B 0.291635599769993 0.444466545540823 0.263897854689184 +xy_12_2 12 2 C 0.36763798979782 0.203911653614431 0.428450356587749 +xy_13_2 13 2 C 0.344608135177236 0.304026642707691 0.351365222115073 +xy_10_3 10 3 C 0.37046458150651 0.205561286708086 0.423974131785404 +xy_11_3 11 3 C 0.358113833435286 0.262878459144526 0.379007707420187 +xy_12_3 12 3 B 0.180921926305915 0.66902588624642 0.150052187447665 +xy_13_3 13 3 C 0.378266307042675 0.20859472985319 0.413138963104135
