comparison utils.py @ 2:50753817983a draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
author   bgruening
date     Sat, 09 May 2020 09:38:04 +0000
parents  22ebbac136c7
children 98bc44d17561
comparing 1:275e98795e99 (left) with 2:50753817983a (right)
1 import os | 1 import os |
2 import numpy as np | 2 import numpy as np |
3 import json | 3 import json |
4 import h5py | 4 import h5py |
5 | 5 import random |
6 from keras.models import model_from_json, Sequential | 6 |
7 from keras.layers import Dense, GRU, Dropout | |
8 from keras.layers.embeddings import Embedding | |
9 from keras.layers.core import SpatialDropout1D | |
10 from keras.optimizers import RMSprop | |
11 from keras import backend as K | 7 from keras import backend as K |
12 | 8 |
13 | 9 |
14 def read_file(file_path): | 10 def read_file(file_path): |
15 """ | 11 """ |
18 with open(file_path, "r") as json_file: | 14 with open(file_path, "r") as json_file: |
19 file_content = json.loads(json_file.read()) | 15 file_content = json.loads(json_file.read()) |
20 return file_content | 16 return file_content |
21 | 17 |
22 | 18 |
23 def write_file(file_path, content): | |
24 """ | |
25 Write a file | |
26 """ | |
27 remove_file(file_path) | |
28 with open(file_path, "w") as json_file: | |
29 json_file.write(json.dumps(content)) | |
30 | |
31 | |
32 def save_processed_workflows(file_path, unique_paths): | |
33 workflow_paths_unique = "" | |
34 for path in unique_paths: | |
35 workflow_paths_unique += path + "\n" | |
36 with open(file_path, "w") as workflows_file: | |
37 workflows_file.write(workflow_paths_unique) | |
38 | |
39 | |
40 def load_saved_model(model_config, model_weights): | |
41 """ | |
42 Load the saved trained model using the saved network and its weights | |
43 """ | |
44 # load the network | |
45 loaded_model = model_from_json(model_config) | |
46 # load the saved weights into the model | |
47 loaded_model.set_weights(model_weights) | |
48 return loaded_model | |
49 | |
50 | |
51 def format_tool_id(tool_link): | 19 def format_tool_id(tool_link): |
52 """ | 20 """ |
53 Extract tool id from tool link | 21 Extract tool id from tool link |
54 """ | 22 """ |
55 tool_id_split = tool_link.split("/") | 23 tool_id_split = tool_link.split("/") |
56 tool_id = tool_id_split[-2] if len(tool_id_split) > 1 else tool_link | 24 tool_id = tool_id_split[-2] if len(tool_id_split) > 1 else tool_link |
57 return tool_id | 25 return tool_id |
58 | |
59 | |
60 def get_HDF5(hf, d_key): | |
61 """ | |
62 Read h5 file to get train and test data | |
63 """ | |
64 return hf.get(d_key).value | |
65 | |
66 | |
67 def save_HDF5(hf_file, d_key, data, d_type=""): | |
68 """ | |
69 Save datasets as h5 file | |
70 """ | |
71 if (d_type == 'json'): | |
72 data = json.dumps(data) | |
73 hf_file.create_dataset(d_key, data=data) | |
74 | 26 |
75 | 27 |
76 def set_trained_model(dump_file, model_values): | 28 def set_trained_model(dump_file, model_values): |
77 """ | 29 """ |
78 Create an h5 file with the trained weights and associated dicts | 30 Create an h5 file with the trained weights and associated dicts |
93 else: | 45 else: |
94 hf_file.create_dataset(key, data=json.dumps(value)) | 46 hf_file.create_dataset(key, data=json.dumps(value)) |
95 hf_file.close() | 47 hf_file.close() |
96 | 48 |
97 | 49 |
98 def remove_file(file_path): | |
99 if os.path.exists(file_path): | |
100 os.remove(file_path) | |
101 | |
102 | |
103 def extract_configuration(config_object): | |
104 config_loss = dict() | |
105 for index, item in enumerate(config_object): | |
106 config_loss[index] = list() | |
107 d_config = dict() | |
108 d_config['loss'] = item['result']['loss'] | |
109 d_config['params_config'] = item['misc']['vals'] | |
110 config_loss[index].append(d_config) | |
111 return config_loss | |
112 | |
113 | |
114 def get_best_parameters(mdl_dict): | |
115 """ | |
116 Get param values (defaults as well) | |
117 """ | |
118 lr = float(mdl_dict.get("learning_rate", "0.001")) | |
119 embedding_size = int(mdl_dict.get("embedding_size", "512")) | |
120 dropout = float(mdl_dict.get("dropout", "0.2")) | |
121 recurrent_dropout = float(mdl_dict.get("recurrent_dropout", "0.2")) | |
122 spatial_dropout = float(mdl_dict.get("spatial_dropout", "0.2")) | |
123 units = int(mdl_dict.get("units", "512")) | |
124 batch_size = int(mdl_dict.get("batch_size", "512")) | |
125 activation_recurrent = mdl_dict.get("activation_recurrent", "elu") | |
126 activation_output = mdl_dict.get("activation_output", "sigmoid") | |
127 | |
128 return { | |
129 "lr": lr, | |
130 "embedding_size": embedding_size, | |
131 "dropout": dropout, | |
132 "recurrent_dropout": recurrent_dropout, | |
133 "spatial_dropout": spatial_dropout, | |
134 "units": units, | |
135 "batch_size": batch_size, | |
136 "activation_recurrent": activation_recurrent, | |
137 "activation_output": activation_output, | |
138 } | |
139 | |
140 | |
141 def weighted_loss(class_weights): | 50 def weighted_loss(class_weights): |
142 """ | 51 """ |
143 Create a weighted loss function. Penalise the misclassification | 52 Create a weighted loss function. Penalise the misclassification |
144 of classes more with the higher usage | 53 of classes more with the higher usage |
145 """ | 54 """ |
146 weight_values = list(class_weights.values()) | 55 weight_values = list(class_weights.values()) |
56 weight_values.extend(weight_values) | |
147 | 57 |
148 def weighted_binary_crossentropy(y_true, y_pred): | 58 def weighted_binary_crossentropy(y_true, y_pred): |
149 # add another dimension to compute dot product | 59 # add another dimension to compute dot product |
150 expanded_weights = K.expand_dims(weight_values, axis=-1) | 60 expanded_weights = K.expand_dims(weight_values, axis=-1) |
151 return K.dot(K.binary_crossentropy(y_true, y_pred), expanded_weights) | 61 return K.dot(K.binary_crossentropy(y_true, y_pred), expanded_weights) |
152 return weighted_binary_crossentropy | 62 return weighted_binary_crossentropy |
153 | 63 |
154 | 64 |
155 def set_recurrent_network(mdl_dict, reverse_dictionary, class_weights): | 65 def balanced_sample_generator(train_data, train_labels, batch_size, l_tool_tr_samples): |
156 """ | 66 while True: |
157 Create a RNN network and set its parameters | 67 dimension = train_data.shape[1] |
158 """ | 68 n_classes = train_labels.shape[1] |
159 dimensions = len(reverse_dictionary) + 1 | 69 tool_ids = list(l_tool_tr_samples.keys()) |
160 model_params = get_best_parameters(mdl_dict) | 70 generator_batch_data = np.zeros([batch_size, dimension]) |
161 | 71 generator_batch_labels = np.zeros([batch_size, n_classes]) |
162 # define the architecture of the neural network | 72 for i in range(batch_size): |
163 model = Sequential() | 73 random_toolid_index = random.sample(range(0, len(tool_ids)), 1)[0] |
164 model.add(Embedding(dimensions, model_params["embedding_size"], mask_zero=True)) | 74 random_toolid = tool_ids[random_toolid_index] |
165 model.add(SpatialDropout1D(model_params["spatial_dropout"])) | 75 sample_indices = l_tool_tr_samples[str(random_toolid)] |
166 model.add(GRU(model_params["units"], dropout=model_params["spatial_dropout"], recurrent_dropout=model_params["recurrent_dropout"], activation=model_params["activation_recurrent"], return_sequences=True)) | 76 random_index = random.sample(range(0, len(sample_indices)), 1)[0] |
167 model.add(Dropout(model_params["dropout"])) | 77 random_tr_index = sample_indices[random_index] |
168 model.add(GRU(model_params["units"], dropout=model_params["spatial_dropout"], recurrent_dropout=model_params["recurrent_dropout"], activation=model_params["activation_recurrent"], return_sequences=False)) | 78 generator_batch_data[i] = train_data[random_tr_index] |
169 model.add(Dropout(model_params["dropout"])) | 79 generator_batch_labels[i] = train_labels[random_tr_index] |
170 model.add(Dense(dimensions, activation=model_params["activation_output"])) | 80 yield generator_batch_data, generator_batch_labels |
171 optimizer = RMSprop(lr=model_params["lr"]) | 81 |
172 model.compile(loss=weighted_loss(class_weights), optimizer=optimizer) | 82 |
173 return model, model_params | 83 def compute_precision(model, x, y, reverse_data_dictionary, usage_scores, actual_classes_pos, topk, standard_conn, last_tool_id, lowest_tool_ids): |
174 | |
175 | |
176 def compute_precision(model, x, y, reverse_data_dictionary, next_compatible_tools, usage_scores, actual_classes_pos, topk): | |
177 """ | 84 """ |
178 Compute absolute and compatible precision | 85 Compute absolute and compatible precision |
179 """ | 86 """ |
180 absolute_precision = 0.0 | 87 pred_t_name = "" |
88 top_precision = 0.0 | |
89 mean_usage = 0.0 | |
90 usage_wt_score = list() | |
91 pub_precision = 0.0 | |
92 lowest_pub_prec = 0.0 | |
93 lowest_norm_prec = 0.0 | |
94 pub_tools = list() | |
95 actual_next_tool_names = list() | |
181 test_sample = np.reshape(x, (1, len(x))) | 96 test_sample = np.reshape(x, (1, len(x))) |
182 | 97 |
183 # predict next tools for a test path | 98 # predict next tools for a test path |
184 prediction = model.predict(test_sample, verbose=0) | 99 prediction = model.predict(test_sample, verbose=0) |
185 | 100 |
101 # divide the predicted vector into two halves - one for published and | |
102 # another for normal workflows | |
186 nw_dimension = prediction.shape[1] | 103 nw_dimension = prediction.shape[1] |
187 | 104 half_len = int(nw_dimension / 2) |
188 # remove the 0th position as there is no tool at this index | 105 |
106 # predict tools | |
189 prediction = np.reshape(prediction, (nw_dimension,)) | 107 prediction = np.reshape(prediction, (nw_dimension,)) |
190 | 108 # get predictions of tools from published workflows |
191 prediction_pos = np.argsort(prediction, axis=-1) | 109 standard_pred = prediction[:half_len] |
192 topk_prediction_pos = prediction_pos[-topk:] | 110 # get predictions of tools from normal workflows |
193 | 111 normal_pred = prediction[half_len:] |
194 # remove the wrong tool position from the predicted list of tool positions | 112 |
195 topk_prediction_pos = [x for x in topk_prediction_pos if x > 0] | 113 standard_prediction_pos = np.argsort(standard_pred, axis=-1) |
196 | 114 standard_topk_prediction_pos = standard_prediction_pos[-topk] |
197 # read tool names using reverse dictionary | 115 |
198 actual_next_tool_names = [reverse_data_dictionary[int(tool_pos)] for tool_pos in actual_classes_pos] | 116 normal_prediction_pos = np.argsort(normal_pred, axis=-1) |
199 top_predicted_next_tool_names = [reverse_data_dictionary[int(tool_pos)] for tool_pos in topk_prediction_pos] | 117 normal_topk_prediction_pos = normal_prediction_pos[-topk] |
200 | 118 |
201 # compute the class weights of predicted tools | 119 # get true tools names |
202 mean_usg_score = 0 | 120 for a_t_pos in actual_classes_pos: |
203 usg_wt_scores = list() | 121 if a_t_pos > half_len: |
204 for t_id in topk_prediction_pos: | 122 t_name = reverse_data_dictionary[int(a_t_pos - half_len)] |
205 t_name = reverse_data_dictionary[int(t_id)] | 123 else: |
206 if t_id in usage_scores and t_name in actual_next_tool_names: | 124 t_name = reverse_data_dictionary[int(a_t_pos)] |
207 usg_wt_scores.append(np.log(usage_scores[t_id] + 1.0)) | 125 actual_next_tool_names.append(t_name) |
208 if len(usg_wt_scores) > 0: | 126 last_tool_name = reverse_data_dictionary[x[-1]] |
209 mean_usg_score = np.sum(usg_wt_scores) / float(topk) | 127 # compute scores for published recommendations |
210 false_positives = [tool_name for tool_name in top_predicted_next_tool_names if tool_name not in actual_next_tool_names] | 128 if standard_topk_prediction_pos in reverse_data_dictionary: |
211 absolute_precision = 1 - (len(false_positives) / float(topk)) | 129 pred_t_name = reverse_data_dictionary[int(standard_topk_prediction_pos)] |
212 return mean_usg_score, absolute_precision | 130 if last_tool_name in standard_conn: |
213 | 131 pub_tools = standard_conn[last_tool_name] |
214 | 132 if pred_t_name in pub_tools: |
215 def verify_model(model, x, y, reverse_data_dictionary, next_compatible_tools, usage_scores, topk_list=[1, 2, 3]): | 133 pub_precision = 1.0 |
134 if last_tool_id in lowest_tool_ids: | |
135 lowest_pub_prec = 1.0 | |
136 if standard_topk_prediction_pos in usage_scores: | |
137 usage_wt_score.append(np.log(usage_scores[standard_topk_prediction_pos] + 1.0)) | |
138 # compute scores for normal recommendations | |
139 if normal_topk_prediction_pos in reverse_data_dictionary: | |
140 pred_t_name = reverse_data_dictionary[int(normal_topk_prediction_pos)] | |
141 if pred_t_name in actual_next_tool_names: | |
142 if normal_topk_prediction_pos in usage_scores: | |
143 usage_wt_score.append(np.log(usage_scores[normal_topk_prediction_pos] + 1.0)) | |
144 top_precision = 1.0 | |
145 if last_tool_id in lowest_tool_ids: | |
146 lowest_norm_prec = 1.0 | |
147 if len(usage_wt_score) > 0: | |
148 mean_usage = np.mean(usage_wt_score) | |
149 return mean_usage, top_precision, pub_precision, lowest_pub_prec, lowest_norm_prec | |
150 | |
151 | |
152 def get_lowest_tools(l_tool_freq, fraction=0.25): | |
153 l_tool_freq = dict(sorted(l_tool_freq.items(), key=lambda kv: kv[1], reverse=True)) | |
154 tool_ids = list(l_tool_freq.keys()) | |
155 lowest_ids = tool_ids[-int(len(tool_ids) * fraction):] | |
156 return lowest_ids | |
157 | |
158 | |
159 def verify_model(model, x, y, reverse_data_dictionary, usage_scores, standard_conn, lowest_tool_ids, topk_list=[1, 2, 3]): | |
216 """ | 160 """ |
217 Verify the model on test data | 161 Verify the model on test data |
218 """ | 162 """ |
219 print("Evaluating performance on test data...") | 163 print("Evaluating performance on test data...") |
220 print("Test data size: %d" % len(y)) | 164 print("Test data size: %d" % len(y)) |
221 size = y.shape[0] | 165 size = y.shape[0] |
222 precision = np.zeros([len(y), len(topk_list)]) | 166 precision = np.zeros([len(y), len(topk_list)]) |
223 usage_weights = np.zeros([len(y), len(topk_list)]) | 167 usage_weights = np.zeros([len(y), len(topk_list)]) |
168 epo_pub_prec = np.zeros([len(y), len(topk_list)]) | |
169 epo_lowest_tools_pub_prec = list() | |
170 epo_lowest_tools_norm_prec = list() | |
171 | |
224 # loop over all the test samples and find prediction precision | 172 # loop over all the test samples and find prediction precision |
225 for i in range(size): | 173 for i in range(size): |
174 lowest_pub_topk = list() | |
175 lowest_norm_topk = list() | |
226 actual_classes_pos = np.where(y[i] > 0)[0] | 176 actual_classes_pos = np.where(y[i] > 0)[0] |
177 test_sample = x[i, :] | |
178 last_tool_id = str(int(test_sample[-1])) | |
227 for index, abs_topk in enumerate(topk_list): | 179 for index, abs_topk in enumerate(topk_list): |
228 abs_mean_usg_score, absolute_precision = compute_precision(model, x[i, :], y, reverse_data_dictionary, next_compatible_tools, usage_scores, actual_classes_pos, abs_topk) | 180 usg_wt_score, absolute_precision, pub_prec, lowest_p_prec, lowest_n_prec = compute_precision(model, test_sample, y, reverse_data_dictionary, usage_scores, actual_classes_pos, abs_topk, standard_conn, last_tool_id, lowest_tool_ids) |
229 precision[i][index] = absolute_precision | 181 precision[i][index] = absolute_precision |
230 usage_weights[i][index] = abs_mean_usg_score | 182 usage_weights[i][index] = usg_wt_score |
183 epo_pub_prec[i][index] = pub_prec | |
184 if last_tool_id in lowest_tool_ids: | |
185 lowest_pub_topk.append(lowest_p_prec) | |
186 lowest_norm_topk.append(lowest_n_prec) | |
187 if last_tool_id in lowest_tool_ids: | |
188 epo_lowest_tools_pub_prec.append(lowest_pub_topk) | |
189 epo_lowest_tools_norm_prec.append(lowest_norm_topk) | |
231 mean_precision = np.mean(precision, axis=0) | 190 mean_precision = np.mean(precision, axis=0) |
232 mean_usage = np.mean(usage_weights, axis=0) | 191 mean_usage = np.mean(usage_weights, axis=0) |
233 return mean_precision, mean_usage | 192 mean_pub_prec = np.mean(epo_pub_prec, axis=0) |
234 | 193 mean_lowest_pub_prec = np.mean(epo_lowest_tools_pub_prec, axis=0) |
235 | 194 mean_lowest_norm_prec = np.mean(epo_lowest_tools_norm_prec, axis=0) |
236 def save_model(results, data_dictionary, compatible_next_tools, trained_model_path, class_weights): | 195 return mean_usage, mean_precision, mean_pub_prec, mean_lowest_pub_prec, mean_lowest_norm_prec, len(epo_lowest_tools_pub_prec) |
196 | |
197 | |
198 def save_model(results, data_dictionary, compatible_next_tools, trained_model_path, class_weights, standard_connections): | |
237 # save files | 199 # save files |
238 trained_model = results["model"] | 200 trained_model = results["model"] |
239 best_model_parameters = results["best_parameters"] | 201 best_model_parameters = results["best_parameters"] |
240 model_config = trained_model.to_json() | 202 model_config = trained_model.to_json() |
241 model_weights = trained_model.get_weights() | 203 model_weights = trained_model.get_weights() |
242 | |
243 model_values = { | 204 model_values = { |
244 'data_dictionary': data_dictionary, | 205 'data_dictionary': data_dictionary, |
245 'model_config': model_config, | 206 'model_config': model_config, |
246 'best_parameters': best_model_parameters, | 207 'best_parameters': best_model_parameters, |
247 'model_weights': model_weights, | 208 'model_weights': model_weights, |
248 "compatible_tools": compatible_next_tools, | 209 "compatible_tools": compatible_next_tools, |
249 "class_weights": class_weights | 210 "class_weights": class_weights, |
211 "standard_connections": standard_connections | |
250 } | 212 } |
251 set_trained_model(trained_model_path, model_values) | 213 set_trained_model(trained_model_path, model_values) |
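
A minimal sketch of how the utilities retained by this change (weighted_loss and balanced_sample_generator) might be driven from a Keras 2.x training loop. It is illustrative only and not part of the commit: the stand-in Dense model, the toy arrays and the small l_tool_tr_samples/class_weights dictionaries are assumptions made for this example, since the real network and data preparation presumably live elsewhere in the repository.

    # Minimal sketch, assuming the functions in this file are importable as `utils`.
    import numpy as np
    from keras.models import Sequential
    from keras.layers import Dense
    import utils

    # toy data: 3 samples, sequence length 5, 4 tools -> 8 output classes
    # (published + normal halves, as in compute_precision above)
    train_data = np.random.randint(1, 5, size=(3, 5)).astype(np.float32)
    train_labels = np.zeros((3, 8))
    train_labels[np.arange(3), [0, 3, 5]] = 1.0
    class_weights = {str(i): 1.0 for i in range(4)}    # doubled inside weighted_loss
    l_tool_tr_samples = {"1": [0, 1], "2": [2]}        # tool id -> training sample indices

    # stand-in network for illustration; the real architecture is defined elsewhere
    model = Sequential([Dense(8, activation="sigmoid", input_shape=(5,))])
    model.compile(loss=utils.weighted_loss(class_weights), optimizer="rmsprop")

    # draw class-balanced batches with the generator defined in this file
    generator = utils.balanced_sample_generator(train_data, train_labels, 2, l_tool_tr_samples)
    model.fit_generator(generator, steps_per_epoch=4, epochs=1, verbose=0)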