comparison utils.py @ 2:50753817983a draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
author bgruening
date Sat, 09 May 2020 09:38:04 +0000
parents 22ebbac136c7
children 98bc44d17561
comparing 1:275e98795e99 with 2:50753817983a
@@ -1,15 +1,11 @@
 import os
 import numpy as np
 import json
 import h5py
-
-from keras.models import model_from_json, Sequential
-from keras.layers import Dense, GRU, Dropout
-from keras.layers.embeddings import Embedding
-from keras.layers.core import SpatialDropout1D
-from keras.optimizers import RMSprop
+import random
+
 from keras import backend as K
 
 
 def read_file(file_path):
     """
@@ -18,61 +14,17 @@
     with open(file_path, "r") as json_file:
         file_content = json.loads(json_file.read())
     return file_content
 
 
-def write_file(file_path, content):
-    """
-    Write a file
-    """
-    remove_file(file_path)
-    with open(file_path, "w") as json_file:
-        json_file.write(json.dumps(content))
-
-
-def save_processed_workflows(file_path, unique_paths):
-    workflow_paths_unique = ""
-    for path in unique_paths:
-        workflow_paths_unique += path + "\n"
-    with open(file_path, "w") as workflows_file:
-        workflows_file.write(workflow_paths_unique)
-
-
-def load_saved_model(model_config, model_weights):
-    """
-    Load the saved trained model using the saved network and its weights
-    """
-    # load the network
-    loaded_model = model_from_json(model_config)
-    # load the saved weights into the model
-    loaded_model.set_weights(model_weights)
-    return loaded_model
-
-
 def format_tool_id(tool_link):
     """
     Extract tool id from tool link
     """
     tool_id_split = tool_link.split("/")
     tool_id = tool_id_split[-2] if len(tool_id_split) > 1 else tool_link
     return tool_id
-
-
-def get_HDF5(hf, d_key):
-    """
-    Read h5 file to get train and test data
-    """
-    return hf.get(d_key).value
-
-
-def save_HDF5(hf_file, d_key, data, d_type=""):
-    """
-    Save datasets as h5 file
-    """
-    if (d_type == 'json'):
-        data = json.dumps(data)
-    hf_file.create_dataset(d_key, data=data)
 
 
 def set_trained_model(dump_file, model_values):
     """
     Create an h5 file with the trained weights and associated dicts
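For context, format_tool_id (unchanged above) keeps the second-to-last segment of a "/"-separated tool link, dropping the trailing version. A quick illustration with a hypothetical ToolShed link; the identifiers below are made up:

>>> format_tool_id("toolshed.g2.bx.psu.edu/repos/bgruening/trim_galore/trim_galore/0.4.3")
'trim_galore'
>>> format_tool_id("upload1")  # no "/" present, so the link is returned unchanged
'upload1'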
@@ -93,62 +45,20 @@
         else:
             hf_file.create_dataset(key, data=json.dumps(value))
     hf_file.close()
 
 
-def remove_file(file_path):
-    if os.path.exists(file_path):
-        os.remove(file_path)
-
-
-def extract_configuration(config_object):
-    config_loss = dict()
-    for index, item in enumerate(config_object):
-        config_loss[index] = list()
-        d_config = dict()
-        d_config['loss'] = item['result']['loss']
-        d_config['params_config'] = item['misc']['vals']
-        config_loss[index].append(d_config)
-    return config_loss
-
-
-def get_best_parameters(mdl_dict):
-    """
-    Get param values (defaults as well)
-    """
-    lr = float(mdl_dict.get("learning_rate", "0.001"))
-    embedding_size = int(mdl_dict.get("embedding_size", "512"))
-    dropout = float(mdl_dict.get("dropout", "0.2"))
-    recurrent_dropout = float(mdl_dict.get("recurrent_dropout", "0.2"))
-    spatial_dropout = float(mdl_dict.get("spatial_dropout", "0.2"))
-    units = int(mdl_dict.get("units", "512"))
-    batch_size = int(mdl_dict.get("batch_size", "512"))
-    activation_recurrent = mdl_dict.get("activation_recurrent", "elu")
-    activation_output = mdl_dict.get("activation_output", "sigmoid")
-
-    return {
-        "lr": lr,
-        "embedding_size": embedding_size,
-        "dropout": dropout,
-        "recurrent_dropout": recurrent_dropout,
-        "spatial_dropout": spatial_dropout,
-        "units": units,
-        "batch_size": batch_size,
-        "activation_recurrent": activation_recurrent,
-        "activation_output": activation_output,
-    }
-
-
 def weighted_loss(class_weights):
     """
     Create a weighted loss function. Penalise the misclassification
     of classes more with the higher usage
     """
     weight_values = list(class_weights.values())
+    weight_values.extend(weight_values)
 
     def weighted_binary_crossentropy(y_true, y_pred):
         # add another dimension to compute dot product
         expanded_weights = K.expand_dims(weight_values, axis=-1)
         return K.dot(K.binary_crossentropy(y_true, y_pred), expanded_weights)
     return weighted_binary_crossentropy
 
 
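A note on the inserted weight_values.extend(weight_values): with this changeset the network emits a doubled output vector, one half scored against published (standard) workflows and one half against normal workflows (see compute_precision below), so the class-weight vector is duplicated to match the doubled output dimension. A minimal sketch of attaching the loss, assuming a toy Keras Sequential built here purely for illustration (nothing below is taken from this changeset):

    from keras.models import Sequential
    from keras.layers import Dense

    # hypothetical class weights: class index -> usage-based weight
    class_weights = {0: 1.0, 1: 3.5, 2: 0.7}
    # toy network whose output is twice the number of classes,
    # matching the duplicated weight vector
    model = Sequential()
    model.add(Dense(2 * len(class_weights), input_dim=10, activation="sigmoid"))
    model.compile(loss=weighted_loss(class_weights), optimizer="rmsprop")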
@@ -155,22 +65,19 @@
-def set_recurrent_network(mdl_dict, reverse_dictionary, class_weights):
-    """
-    Create a RNN network and set its parameters
-    """
-    dimensions = len(reverse_dictionary) + 1
-    model_params = get_best_parameters(mdl_dict)
-
-    # define the architecture of the neural network
-    model = Sequential()
-    model.add(Embedding(dimensions, model_params["embedding_size"], mask_zero=True))
-    model.add(SpatialDropout1D(model_params["spatial_dropout"]))
-    model.add(GRU(model_params["units"], dropout=model_params["spatial_dropout"], recurrent_dropout=model_params["recurrent_dropout"], activation=model_params["activation_recurrent"], return_sequences=True))
-    model.add(Dropout(model_params["dropout"]))
-    model.add(GRU(model_params["units"], dropout=model_params["spatial_dropout"], recurrent_dropout=model_params["recurrent_dropout"], activation=model_params["activation_recurrent"], return_sequences=False))
-    model.add(Dropout(model_params["dropout"]))
-    model.add(Dense(dimensions, activation=model_params["activation_output"]))
-    optimizer = RMSprop(lr=model_params["lr"])
-    model.compile(loss=weighted_loss(class_weights), optimizer=optimizer)
-    return model, model_params
-
-
-def compute_precision(model, x, y, reverse_data_dictionary, next_compatible_tools, usage_scores, actual_classes_pos, topk):
+def balanced_sample_generator(train_data, train_labels, batch_size, l_tool_tr_samples):
+    while True:
+        dimension = train_data.shape[1]
+        n_classes = train_labels.shape[1]
+        tool_ids = list(l_tool_tr_samples.keys())
+        generator_batch_data = np.zeros([batch_size, dimension])
+        generator_batch_labels = np.zeros([batch_size, n_classes])
+        for i in range(batch_size):
+            random_toolid_index = random.sample(range(0, len(tool_ids)), 1)[0]
+            random_toolid = tool_ids[random_toolid_index]
+            sample_indices = l_tool_tr_samples[str(random_toolid)]
+            random_index = random.sample(range(0, len(sample_indices)), 1)[0]
+            random_tr_index = sample_indices[random_index]
+            generator_batch_data[i] = train_data[random_tr_index]
+            generator_batch_labels[i] = train_labels[random_tr_index]
+        yield generator_batch_data, generator_batch_labels
+
+
+def compute_precision(model, x, y, reverse_data_dictionary, usage_scores, actual_classes_pos, topk, standard_conn, last_tool_id, lowest_tool_ids):
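The new balanced_sample_generator builds each batch row by first sampling a tool id uniformly and then sampling one of that tool's training rows, so frequently used tools do not dominate the batches. A sketch of how the generator might be consumed, assuming a compiled Keras model and an illustrative l_tool_tr_samples mapping (tool id -> indices of training samples labelled with that tool); these names and shapes are assumptions, not taken from this changeset:

    train_data = np.zeros([100, 25])    # 100 samples, sequence length 25
    train_labels = np.zeros([100, 40])  # 40 output classes
    l_tool_tr_samples = {"1": [0, 5, 9], "2": [1, 2], "3": [3, 4, 6]}
    generator = balanced_sample_generator(train_data, train_labels, 32, l_tool_tr_samples)
    # `model` is assumed to be a compiled Keras model (Keras 2.x API)
    model.fit_generator(generator, steps_per_epoch=len(train_data) // 32, epochs=10)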
177 """ 84 """
178 Compute absolute and compatible precision 85 Compute absolute and compatible precision
179 """ 86 """
180 absolute_precision = 0.0 87 pred_t_name = ""
88 top_precision = 0.0
89 mean_usage = 0.0
90 usage_wt_score = list()
91 pub_precision = 0.0
92 lowest_pub_prec = 0.0
93 lowest_norm_prec = 0.0
94 pub_tools = list()
95 actual_next_tool_names = list()
181 test_sample = np.reshape(x, (1, len(x))) 96 test_sample = np.reshape(x, (1, len(x)))
182 97
183 # predict next tools for a test path 98 # predict next tools for a test path
184 prediction = model.predict(test_sample, verbose=0) 99 prediction = model.predict(test_sample, verbose=0)
185 100
101 # divide the predicted vector into two halves - one for published and
102 # another for normal workflows
186 nw_dimension = prediction.shape[1] 103 nw_dimension = prediction.shape[1]
187 104 half_len = int(nw_dimension / 2)
188 # remove the 0th position as there is no tool at this index 105
106 # predict tools
189 prediction = np.reshape(prediction, (nw_dimension,)) 107 prediction = np.reshape(prediction, (nw_dimension,))
190 108 # get predictions of tools from published workflows
191 prediction_pos = np.argsort(prediction, axis=-1) 109 standard_pred = prediction[:half_len]
192 topk_prediction_pos = prediction_pos[-topk:] 110 # get predictions of tools from normal workflows
193 111 normal_pred = prediction[half_len:]
194 # remove the wrong tool position from the predicted list of tool positions 112
195 topk_prediction_pos = [x for x in topk_prediction_pos if x > 0] 113 standard_prediction_pos = np.argsort(standard_pred, axis=-1)
196 114 standard_topk_prediction_pos = standard_prediction_pos[-topk]
197 # read tool names using reverse dictionary 115
198 actual_next_tool_names = [reverse_data_dictionary[int(tool_pos)] for tool_pos in actual_classes_pos] 116 normal_prediction_pos = np.argsort(normal_pred, axis=-1)
199 top_predicted_next_tool_names = [reverse_data_dictionary[int(tool_pos)] for tool_pos in topk_prediction_pos] 117 normal_topk_prediction_pos = normal_prediction_pos[-topk]
200 118
201 # compute the class weights of predicted tools 119 # get true tools names
202 mean_usg_score = 0 120 for a_t_pos in actual_classes_pos:
203 usg_wt_scores = list() 121 if a_t_pos > half_len:
204 for t_id in topk_prediction_pos: 122 t_name = reverse_data_dictionary[int(a_t_pos - half_len)]
205 t_name = reverse_data_dictionary[int(t_id)] 123 else:
206 if t_id in usage_scores and t_name in actual_next_tool_names: 124 t_name = reverse_data_dictionary[int(a_t_pos)]
207 usg_wt_scores.append(np.log(usage_scores[t_id] + 1.0)) 125 actual_next_tool_names.append(t_name)
208 if len(usg_wt_scores) > 0: 126 last_tool_name = reverse_data_dictionary[x[-1]]
209 mean_usg_score = np.sum(usg_wt_scores) / float(topk) 127 # compute scores for published recommendations
210 false_positives = [tool_name for tool_name in top_predicted_next_tool_names if tool_name not in actual_next_tool_names] 128 if standard_topk_prediction_pos in reverse_data_dictionary:
211 absolute_precision = 1 - (len(false_positives) / float(topk)) 129 pred_t_name = reverse_data_dictionary[int(standard_topk_prediction_pos)]
212 return mean_usg_score, absolute_precision 130 if last_tool_name in standard_conn:
213 131 pub_tools = standard_conn[last_tool_name]
214 132 if pred_t_name in pub_tools:
215 def verify_model(model, x, y, reverse_data_dictionary, next_compatible_tools, usage_scores, topk_list=[1, 2, 3]): 133 pub_precision = 1.0
134 if last_tool_id in lowest_tool_ids:
135 lowest_pub_prec = 1.0
136 if standard_topk_prediction_pos in usage_scores:
137 usage_wt_score.append(np.log(usage_scores[standard_topk_prediction_pos] + 1.0))
138 # compute scores for normal recommendations
139 if normal_topk_prediction_pos in reverse_data_dictionary:
140 pred_t_name = reverse_data_dictionary[int(normal_topk_prediction_pos)]
141 if pred_t_name in actual_next_tool_names:
142 if normal_topk_prediction_pos in usage_scores:
143 usage_wt_score.append(np.log(usage_scores[normal_topk_prediction_pos] + 1.0))
144 top_precision = 1.0
145 if last_tool_id in lowest_tool_ids:
146 lowest_norm_prec = 1.0
147 if len(usage_wt_score) > 0:
148 mean_usage = np.mean(usage_wt_score)
149 return mean_usage, top_precision, pub_precision, lowest_pub_prec, lowest_norm_prec
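The rewritten compute_precision treats the prediction vector as two concatenated halves: positions [0, half_len) hold scores learned from published (standard) workflows and [half_len, nw_dimension) the scores from normal workflows, which is why a true class position above half_len is mapped back with a_t_pos - half_len. A toy illustration of the split; all numbers are made up:

>>> prediction = np.array([0.1, 0.9, 0.3, 0.2, 0.6, 0.8])  # nw_dimension = 6
>>> half_len = 3
>>> topk = 1
>>> np.argsort(prediction[:half_len], axis=-1)[-topk]  # best published tool position
1
>>> np.argsort(prediction[half_len:], axis=-1)[-topk]  # best normal tool position
2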
@@ -214,0 +150,9 @@
+
+
+def get_lowest_tools(l_tool_freq, fraction=0.25):
+    l_tool_freq = dict(sorted(l_tool_freq.items(), key=lambda kv: kv[1], reverse=True))
+    tool_ids = list(l_tool_freq.keys())
+    lowest_ids = tool_ids[-int(len(tool_ids) * fraction):]
+    return lowest_ids
+
+
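The new get_lowest_tools sorts tools by usage frequency in decreasing order and returns the ids in the bottom fraction; verify_model uses these ids to report precision separately for rarely used tools. For example, with hypothetical frequencies:

>>> get_lowest_tools({"tool_a": 120, "tool_b": 45, "tool_c": 9, "tool_d": 2}, fraction=0.25)
['tool_d']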
@@ -215,21 +159,39 @@
-def verify_model(model, x, y, reverse_data_dictionary, next_compatible_tools, usage_scores, topk_list=[1, 2, 3]):
+def verify_model(model, x, y, reverse_data_dictionary, usage_scores, standard_conn, lowest_tool_ids, topk_list=[1, 2, 3]):
     """
     Verify the model on test data
     """
     print("Evaluating performance on test data...")
     print("Test data size: %d" % len(y))
     size = y.shape[0]
     precision = np.zeros([len(y), len(topk_list)])
     usage_weights = np.zeros([len(y), len(topk_list)])
+    epo_pub_prec = np.zeros([len(y), len(topk_list)])
+    epo_lowest_tools_pub_prec = list()
+    epo_lowest_tools_norm_prec = list()
+
     # loop over all the test samples and find prediction precision
     for i in range(size):
+        lowest_pub_topk = list()
+        lowest_norm_topk = list()
         actual_classes_pos = np.where(y[i] > 0)[0]
+        test_sample = x[i, :]
+        last_tool_id = str(int(test_sample[-1]))
         for index, abs_topk in enumerate(topk_list):
-            abs_mean_usg_score, absolute_precision = compute_precision(model, x[i, :], y, reverse_data_dictionary, next_compatible_tools, usage_scores, actual_classes_pos, abs_topk)
+            usg_wt_score, absolute_precision, pub_prec, lowest_p_prec, lowest_n_prec = compute_precision(model, test_sample, y, reverse_data_dictionary, usage_scores, actual_classes_pos, abs_topk, standard_conn, last_tool_id, lowest_tool_ids)
             precision[i][index] = absolute_precision
-            usage_weights[i][index] = abs_mean_usg_score
+            usage_weights[i][index] = usg_wt_score
+            epo_pub_prec[i][index] = pub_prec
+            if last_tool_id in lowest_tool_ids:
+                lowest_pub_topk.append(lowest_p_prec)
+                lowest_norm_topk.append(lowest_n_prec)
+        if last_tool_id in lowest_tool_ids:
+            epo_lowest_tools_pub_prec.append(lowest_pub_topk)
+            epo_lowest_tools_norm_prec.append(lowest_norm_topk)
     mean_precision = np.mean(precision, axis=0)
     mean_usage = np.mean(usage_weights, axis=0)
-    return mean_precision, mean_usage
+    mean_pub_prec = np.mean(epo_pub_prec, axis=0)
+    mean_lowest_pub_prec = np.mean(epo_lowest_tools_pub_prec, axis=0)
+    mean_lowest_norm_prec = np.mean(epo_lowest_tools_norm_prec, axis=0)
+    return mean_usage, mean_precision, mean_pub_prec, mean_lowest_pub_prec, mean_lowest_norm_prec, len(epo_lowest_tools_pub_prec)
 
 
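verify_model now returns six values instead of two: per-topk arrays aggregated over the test set, plus the count of test samples whose last tool is rarely used. A caller written against the old two-value signature therefore needs updating; a sketch with illustrative variable names (the arguments are assumed to be prepared elsewhere):

    mean_usage, mean_precision, mean_pub_prec, mean_lowest_pub_prec, \
        mean_lowest_norm_prec, n_lowest = verify_model(
            model, test_data, test_labels, reverse_dictionary,
            usage_scores, standard_connections, lowest_tool_ids)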
@@ -236,16 +198,16 @@
-def save_model(results, data_dictionary, compatible_next_tools, trained_model_path, class_weights):
+def save_model(results, data_dictionary, compatible_next_tools, trained_model_path, class_weights, standard_connections):
     # save files
     trained_model = results["model"]
     best_model_parameters = results["best_parameters"]
     model_config = trained_model.to_json()
     model_weights = trained_model.get_weights()
-
     model_values = {
         'data_dictionary': data_dictionary,
         'model_config': model_config,
         'best_parameters': best_model_parameters,
         'model_weights': model_weights,
         "compatible_tools": compatible_next_tools,
-        "class_weights": class_weights
+        "class_weights": class_weights,
+        "standard_connections": standard_connections
     }
     set_trained_model(trained_model_path, model_values)
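This changeset deletes the load_saved_model helper, but the model_config and model_weights entries written by save_model/set_trained_model still support the same reconstruction. A sketch that mirrors the removed helper; consumers would first read these two values back out of the h5 file:

    from keras.models import model_from_json

    def load_saved_model(model_config, model_weights):
        # rebuild the network from its JSON config, then restore its weights
        loaded_model = model_from_json(model_config)
        loaded_model.set_weights(model_weights)
        return loaded_model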