utils.py @ 0:22ebbac136c7 (draft)

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"

| author | bgruening |
|---|---|
| date | Wed, 28 Aug 2019 07:19:13 -0400 |
| parents | (none) |
| children | 50753817983a |
```python
import os
import json

import numpy as np
import h5py

from keras.models import model_from_json, Sequential
from keras.layers import Dense, GRU, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.core import SpatialDropout1D
from keras.optimizers import RMSprop
from keras import backend as K


def read_file(file_path):
    """
    Read a JSON file
    """
    with open(file_path, "r") as json_file:
        file_content = json.loads(json_file.read())
    return file_content


def write_file(file_path, content):
    """
    Write content to a JSON file, replacing any existing file
    """
    remove_file(file_path)
    with open(file_path, "w") as json_file:
        json_file.write(json.dumps(content))
```
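A minimal round trip through the two JSON helpers (not part of the file; the file name is a placeholder):

```python
from utils import read_file, write_file  # this module

write_file("model_params.json", {"units": 128, "batch_size": 64})
restored = read_file("model_params.json")
assert restored["units"] == 128
```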
```python
def save_processed_workflows(file_path, unique_paths):
    """
    Write the unique workflow paths to a file, one per line
    """
    workflow_paths_unique = ""
    for path in unique_paths:
        workflow_paths_unique += path + "\n"
    with open(file_path, "w") as workflows_file:
        workflows_file.write(workflow_paths_unique)


def load_saved_model(model_config, model_weights):
    """
    Load the saved trained model using the saved network and its weights
    """
    # load the network architecture from its JSON description
    loaded_model = model_from_json(model_config)
    # load the saved weights into the model
    loaded_model.set_weights(model_weights)
    return loaded_model
```
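A sketch of restoring a model from the h5 file produced by `set_trained_model()` further down; the file name is a placeholder, and the key layout (`model_config`, `weight_0`, `weight_1`, ...) matches what that function writes:

```python
import json

import h5py

from utils import load_saved_model

# "trained_model.h5" is a placeholder path.
with h5py.File("trained_model.h5", "r") as hf:
    # model_config was stored as a JSON-encoded string
    config_json = json.loads(hf.get("model_config").value)
    n_weights = len([key for key in hf.keys() if key.startswith("weight_")])
    model_weights = [hf.get("weight_" + str(idx)).value for idx in range(n_weights)]

model = load_saved_model(config_json, model_weights)
```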
```python
def format_tool_id(tool_link):
    """
    Extract the tool id from a tool link
    """
    tool_id_split = tool_link.split("/")
    tool_id = tool_id_split[-2] if len(tool_id_split) > 1 else tool_link
    return tool_id
```
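For example, with a hypothetical versioned toolshed link:

```python
from utils import format_tool_id

# Versioned link: the second-to-last path segment is the tool id.
print(format_tool_id("toolshed.g2.bx.psu.edu/repos/devteam/bwa/bwa/0.7.17"))  # bwa
# Plain ids without "/" pass through unchanged.
print(format_tool_id("Cut1"))  # Cut1
```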
```python
def get_HDF5(hf, d_key):
    """
    Read a dataset from an open h5 file
    """
    # .value is the legacy h5py accessor, equivalent to hf[d_key][()]
    return hf.get(d_key).value


def save_HDF5(hf_file, d_key, data, d_type=""):
    """
    Save a dataset into an h5 file
    """
    if d_type == "json":
        data = json.dumps(data)
    hf_file.create_dataset(d_key, data=data)
```
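A short sketch of the pair in action (placeholder file name):

```python
import json

import h5py
import numpy as np

from utils import get_HDF5, save_HDF5

with h5py.File("data.h5", "w") as hf:
    save_HDF5(hf, "train_data", np.zeros((10, 25)))
    save_HDF5(hf, "data_dictionary", {"bwa": 1}, d_type="json")

with h5py.File("data.h5", "r") as hf:
    train_data = get_HDF5(hf, "train_data")
    data_dictionary = json.loads(get_HDF5(hf, "data_dictionary"))
```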
```python
def set_trained_model(dump_file, model_values):
    """
    Create an h5 file with the trained weights and associated dicts
    """
    hf_file = h5py.File(dump_file, 'w')
    for key in model_values:
        value = model_values[key]
        if key == 'model_weights':
            # store each weight array under its own key: weight_0, weight_1, ...
            for idx, item in enumerate(value):
                w_key = "weight_" + str(idx)
                if w_key in hf_file:
                    hf_file.modify(w_key, item)
                else:
                    hf_file.create_dataset(w_key, data=item)
        else:
            # everything else is serialised as a JSON string
            if key in hf_file:
                hf_file.modify(key, json.dumps(value))
            else:
                hf_file.create_dataset(key, data=json.dumps(value))
    hf_file.close()
```
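A sketch of the dict layout `set_trained_model()` expects; all values here are placeholders rather than real training output:

```python
import numpy as np

from utils import set_trained_model

model_values = {
    "model_config": '{"class_name": "Sequential"}',   # placeholder JSON string
    "model_weights": [np.zeros(3), np.ones((2, 2))],  # placeholder weight arrays
    "data_dictionary": {"bwa": 1},                    # tool name -> index
    "class_weights": {1: 2.5},                        # class index -> weight
}
set_trained_model("trained_model.h5", model_values)
```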
```python
def remove_file(file_path):
    """
    Delete the file if it exists
    """
    if os.path.exists(file_path):
        os.remove(file_path)


def extract_configuration(config_object):
    """
    Collect the loss and parameter values of each tuning iteration
    """
    config_loss = dict()
    for index, item in enumerate(config_object):
        config_loss[index] = list()
        d_config = dict()
        d_config['loss'] = item['result']['loss']
        d_config['params_config'] = item['misc']['vals']
        config_loss[index].append(d_config)
    return config_loss


def get_best_parameters(mdl_dict):
    """
    Get parameter values, falling back to defaults for missing keys
    """
    lr = float(mdl_dict.get("learning_rate", "0.001"))
    embedding_size = int(mdl_dict.get("embedding_size", "512"))
    dropout = float(mdl_dict.get("dropout", "0.2"))
    recurrent_dropout = float(mdl_dict.get("recurrent_dropout", "0.2"))
    spatial_dropout = float(mdl_dict.get("spatial_dropout", "0.2"))
    units = int(mdl_dict.get("units", "512"))
    batch_size = int(mdl_dict.get("batch_size", "512"))
    activation_recurrent = mdl_dict.get("activation_recurrent", "elu")
    activation_output = mdl_dict.get("activation_output", "sigmoid")

    return {
        "lr": lr,
        "embedding_size": embedding_size,
        "dropout": dropout,
        "recurrent_dropout": recurrent_dropout,
        "spatial_dropout": spatial_dropout,
        "units": units,
        "batch_size": batch_size,
        "activation_recurrent": activation_recurrent,
        "activation_output": activation_output,
    }
```
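With an empty dict every value falls back to its default:

```python
from utils import get_best_parameters

params = get_best_parameters({})
print(params["lr"], params["units"], params["activation_output"])  # 0.001 512 sigmoid
```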
```python
def weighted_loss(class_weights):
    """
    Create a weighted loss function. Penalise the misclassification
    of classes with higher usage more heavily
    """
    weight_values = list(class_weights.values())

    def weighted_binary_crossentropy(y_true, y_pred):
        # add another dimension so the weights form a column vector
        # (the backend converts the Python list to a tensor)
        expanded_weights = K.expand_dims(weight_values, axis=-1)
        # scale each class's cross-entropy term by its weight via the dot product
        return K.dot(K.binary_crossentropy(y_true, y_pred), expanded_weights)
    return weighted_binary_crossentropy
```
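The returned closure goes straight into `model.compile()` in `set_recurrent_network()` below. A sketch with illustrative weights:

```python
from utils import weighted_loss

# class index -> usage-derived weight (illustrative values)
loss_fn = weighted_loss({0: 1.0, 1: 2.5, 2: 0.7})
# passed to Keras as: model.compile(loss=loss_fn, optimizer=...)
```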
```python
def set_recurrent_network(mdl_dict, reverse_dictionary, class_weights):
    """
    Create an RNN and set its parameters
    """
    # one extra dimension for the padding index 0
    dimensions = len(reverse_dictionary) + 1
    model_params = get_best_parameters(mdl_dict)

    # define the architecture of the neural network
    model = Sequential()
    model.add(Embedding(dimensions, model_params["embedding_size"], mask_zero=True))
    model.add(SpatialDropout1D(model_params["spatial_dropout"]))
    # note: the GRU input dropout reuses the spatial dropout rate
    model.add(GRU(
        model_params["units"],
        dropout=model_params["spatial_dropout"],
        recurrent_dropout=model_params["recurrent_dropout"],
        activation=model_params["activation_recurrent"],
        return_sequences=True,
    ))
    model.add(Dropout(model_params["dropout"]))
    model.add(GRU(
        model_params["units"],
        dropout=model_params["spatial_dropout"],
        recurrent_dropout=model_params["recurrent_dropout"],
        activation=model_params["activation_recurrent"],
        return_sequences=False,
    ))
    model.add(Dropout(model_params["dropout"]))
    model.add(Dense(dimensions, activation=model_params["activation_output"]))
    optimizer = RMSprop(lr=model_params["lr"])
    model.compile(loss=weighted_loss(class_weights), optimizer=optimizer)
    return model, model_params
```
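A sketch of building the network from a minimal parameter dict; the dictionaries are placeholders, and unspecified parameters fall back to the defaults in `get_best_parameters()`:

```python
from utils import set_recurrent_network

mdl_dict = {"units": "128", "embedding_size": "64"}  # the rest use defaults
reverse_dictionary = {1: "bwa", 2: "Cut1"}           # index -> tool name
class_weights = {0: 1.0, 1: 2.5, 2: 0.7}             # one weight per output class

model, model_params = set_recurrent_network(mdl_dict, reverse_dictionary, class_weights)
model.summary()
```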
```python
def compute_precision(model, x, y, reverse_data_dictionary, next_compatible_tools, usage_scores, actual_classes_pos, topk):
    """
    Compute the absolute precision and mean usage score for one test sample
    """
    absolute_precision = 0.0
    test_sample = np.reshape(x, (1, len(x)))

    # predict next tools for a test path
    prediction = model.predict(test_sample, verbose=0)

    nw_dimension = prediction.shape[1]

    # flatten the prediction to a vector over all tool positions
    prediction = np.reshape(prediction, (nw_dimension,))

    # take the positions of the topk highest-scoring tools
    prediction_pos = np.argsort(prediction, axis=-1)
    topk_prediction_pos = prediction_pos[-topk:]

    # drop the padding position 0 as there is no tool at this index
    topk_prediction_pos = [pos for pos in topk_prediction_pos if pos > 0]

    # read tool names using the reverse dictionary
    actual_next_tool_names = [reverse_data_dictionary[int(tool_pos)] for tool_pos in actual_classes_pos]
    top_predicted_next_tool_names = [reverse_data_dictionary[int(tool_pos)] for tool_pos in topk_prediction_pos]

    # compute the usage scores of correctly predicted tools
    mean_usg_score = 0
    usg_wt_scores = list()
    for t_id in topk_prediction_pos:
        t_name = reverse_data_dictionary[int(t_id)]
        if t_id in usage_scores and t_name in actual_next_tool_names:
            usg_wt_scores.append(np.log(usage_scores[t_id] + 1.0))
    if len(usg_wt_scores) > 0:
        mean_usg_score = np.sum(usg_wt_scores) / float(topk)
    false_positives = [tool_name for tool_name in top_predicted_next_tool_names if tool_name not in actual_next_tool_names]
    absolute_precision = 1 - (len(false_positives) / float(topk))
    return mean_usg_score, absolute_precision


def verify_model(model, x, y, reverse_data_dictionary, next_compatible_tools, usage_scores, topk_list=[1, 2, 3]):
    """
    Verify the model on test data
    """
    print("Evaluating performance on test data...")
    print("Test data size: %d" % len(y))
    size = y.shape[0]
    precision = np.zeros([len(y), len(topk_list)])
    usage_weights = np.zeros([len(y), len(topk_list)])
    # loop over all the test samples and find the prediction precision
    for i in range(size):
        actual_classes_pos = np.where(y[i] > 0)[0]
        for index, abs_topk in enumerate(topk_list):
            abs_mean_usg_score, absolute_precision = compute_precision(
                model, x[i, :], y, reverse_data_dictionary,
                next_compatible_tools, usage_scores, actual_classes_pos, abs_topk)
            precision[i][index] = absolute_precision
            usage_weights[i][index] = abs_mean_usg_score
    mean_precision = np.mean(precision, axis=0)
    mean_usage = np.mean(usage_weights, axis=0)
    return mean_precision, mean_usage
```
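A usage sketch; `test_x`, `test_y` and the lookup dicts are placeholders for the real test matrices (padded tool-index sequences and multi-hot label vectors):

```python
from utils import verify_model

mean_precision, mean_usage = verify_model(
    model, test_x, test_y, reverse_dictionary,
    compatible_next_tools, usage_scores, topk_list=[1, 2, 3])
print(mean_precision)  # one precision@k value per entry of topk_list
```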
```python
def save_model(results, data_dictionary, compatible_next_tools, trained_model_path, class_weights):
    """
    Save the trained model, its configuration and the associated dicts
    """
    trained_model = results["model"]
    best_model_parameters = results["best_parameters"]
    model_config = trained_model.to_json()
    model_weights = trained_model.get_weights()

    model_values = {
        'data_dictionary': data_dictionary,
        'model_config': model_config,
        'best_parameters': best_model_parameters,
        'model_weights': model_weights,
        "compatible_tools": compatible_next_tools,
        "class_weights": class_weights
    }
    set_trained_model(trained_model_path, model_values)
```
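Finally, a sketch of the dict the training driver is expected to hand to `save_model()`; all names are placeholders:

```python
from utils import save_model

results = {
    "model": model,                   # the fitted Keras model (placeholder)
    "best_parameters": model_params,  # its tuned hyperparameters (placeholder)
}
save_model(results, data_dictionary, compatible_next_tools,
           "trained_model.h5", class_weights)
```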
