comparison prepare_data.py @ 4:f0da532be419 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 5eebc0cb44e71f581d548b7e842002705dd155eb"
| field | value |
|---|---|
| author | bgruening |
| date | Fri, 06 May 2022 09:04:44 +0000 |
| parents | 98bc44d17561 |
| children | 9ec705bd11cb |
| 3:98bc44d17561 | 4:f0da532be419 |
|---|---|
| 2 Prepare the workflow paths to be used by downstream | 2 Prepare the workflow paths to be used by downstream |
| 3 machine learning algorithm. The paths are divided | 3 machine learning algorithm. The paths are divided |
| 4 into the test and training sets | 4 into the test and training sets |
| 5 """ | 5 """ |
| 6 | 6 |
| | 7 import collections |
| 7 import os | 8 import os |
| 8 import collections | 9 import random |
| | 10 |
| 9 import numpy as np | 11 import numpy as np |
| 10 import random | |
| 11 | |
| 12 import predict_tool_usage | 12 import predict_tool_usage |
| 13 | 13 |
| 14 main_path = os.getcwd() | 14 main_path = os.getcwd() |
| 15 | 15 |
| 16 | 16 |
| 17 class PrepareData: | 17 class PrepareData: |
| 18 | |
| 19 def __init__(self, max_seq_length, test_data_share): | 18 def __init__(self, max_seq_length, test_data_share): |
| 20 """ Init method. """ | 19 """ Init method. """ |
| 21 self.max_tool_sequence_len = max_seq_length | 20 self.max_tool_sequence_len = max_seq_length |
| 22 self.test_share = test_data_share | 21 self.test_share = test_data_share |
| 23 | 22 |
| 25 """ | 24 """ |
| 26 Get all the tools and complete set of individual paths for each workflow | 25 Get all the tools and complete set of individual paths for each workflow |
| 27 """ | 26 """ |
| 28 tokens = list() | 27 tokens = list() |
| 29 raw_paths = workflow_paths | 28 raw_paths = workflow_paths |
| 30 raw_paths = [x.replace("\n", '') for x in raw_paths] | 29 raw_paths = [x.replace("\n", "") for x in raw_paths] |
| 31 for item in raw_paths: | 30 for item in raw_paths: |
| 32 split_items = item.split(",") | 31 split_items = item.split(",") |
| 33 for token in split_items: | 32 for token in split_items: |
| 34 if token is not "": | 33 if token != "": |
| 35 tokens.append(token) | 34 tokens.append(token) |
| 36 tokens = list(set(tokens)) | 35 tokens = list(set(tokens)) |
| 37 tokens = np.array(tokens) | 36 tokens = np.array(tokens) |
| 38 tokens = np.reshape(tokens, [-1, ]) | 37 tokens = np.reshape( |
| | 38 tokens, |
| | 39 [ |
| | 40 -1, |
| | 41 ], |
| | 42 ) |
| 39 return tokens, raw_paths | 43 return tokens, raw_paths |
| 40 | 44 |
| 41 def create_new_dict(self, new_data_dict): | 45 def create_new_dict(self, new_data_dict): |
| 42 """ | 46 """ |
| 43 Create new data dictionary | 47 Create new data dictionary |
| 58 """ | 62 """ |
| 59 count = collections.Counter(words).most_common() | 63 count = collections.Counter(words).most_common() |
| 60 dictionary = dict() | 64 dictionary = dict() |
| 61 for word, _ in count: | 65 for word, _ in count: |
| | 66 word = word.strip() |
| 62 dictionary[word] = len(dictionary) + 1 | 67 dictionary[word] = len(dictionary) + 1 |
| 63 dictionary, reverse_dictionary = self.assemble_dictionary(dictionary, old_data_dictionary) | |
| 68 dictionary, reverse_dictionary = self.assemble_dictionary( | |
| | 69 dictionary, old_data_dictionary |
| | 70 ) |
| 64 return dictionary, reverse_dictionary | 71 return dictionary, reverse_dictionary |
| 65 | 72 |
| 66 def decompose_paths(self, paths, dictionary): | 73 def decompose_paths(self, paths, dictionary): |
| 67 """ | 74 """ |
| 68 Decompose the paths to variable length sub-paths keeping the first tool fixed | 75 Decompose the paths to variable length sub-paths keeping the first tool fixed |
| 72 tools = item.split(",") | 79 tools = item.split(",") |
| 73 len_tools = len(tools) | 80 len_tools = len(tools) |
| 74 if len_tools <= self.max_tool_sequence_len: | 81 if len_tools <= self.max_tool_sequence_len: |
| 75 for window in range(1, len_tools): | 82 for window in range(1, len_tools): |
| 76 sequence = tools[0: window + 1] | 83 sequence = tools[0: window + 1] |
| 77 tools_pos = [str(dictionary[str(tool_item)]) for tool_item in sequence] | 84 tools_pos = [ |
| | 85 str(dictionary[str(tool_item)]) for tool_item in sequence |
| | 86 ] |
| 78 if len(tools_pos) > 1: | 87 if len(tools_pos) > 1: |
| 79 sub_paths_pos.append(",".join(tools_pos)) | 88 sub_paths_pos.append(",".join(tools_pos)) |
| 80 sub_paths_pos = list(set(sub_paths_pos)) | 89 sub_paths_pos = list(set(sub_paths_pos)) |
| 81 return sub_paths_pos | 90 return sub_paths_pos |
| 82 | 91 |
| 83 def prepare_paths_labels_dictionary(self, dictionary, reverse_dictionary, paths, compatible_next_tools): | 92 def prepare_paths_labels_dictionary( |
| | 93 self, dictionary, reverse_dictionary, paths, compatible_next_tools |
| | 94 ): |
| 84 """ | 95 """ |
| 85 Create a dictionary of sequences with their labels for training and test paths | 96 Create a dictionary of sequences with their labels for training and test paths |
| 86 """ | 97 """ |
| 87 paths_labels = dict() | 98 paths_labels = dict() |
| 88 random.shuffle(paths) | 99 random.shuffle(paths) |
| 89 for item in paths: | 100 for item in paths: |
| 90 if item and item not in "": | 101 if item and item not in "": |
| 91 tools = item.split(",") | 102 tools = item.split(",") |
| 92 label = tools[-1] | 103 label = tools[-1] |
| 93 train_tools = tools[:len(tools) - 1] | 104 train_tools = tools[: len(tools) - 1] |
| 94 last_but_one_name = reverse_dictionary[int(train_tools[-1])] | 105 last_but_one_name = reverse_dictionary[int(train_tools[-1])] |
| 95 try: | 106 try: |
| 96 compatible_tools = compatible_next_tools[last_but_one_name].split(",") | 107 compatible_tools = compatible_next_tools[last_but_one_name].split( |
| 108 "," | |
| 109 ) | |
| 97 except Exception: | 110 except Exception: |
| 98 continue | 111 continue |
| 99 if len(compatible_tools) > 0: | 112 if len(compatible_tools) > 0: |
| 100 compatible_tools_ids = [str(dictionary[x]) for x in compatible_tools] | 113 compatible_tools_ids = [ |
| | 114 str(dictionary[x]) for x in compatible_tools |
| | 115 ] |
| 101 compatible_tools_ids.append(label) | 116 compatible_tools_ids.append(label) |
| 102 composite_labels = ",".join(compatible_tools_ids) | 117 composite_labels = ",".join(compatible_tools_ids) |
| 103 train_tools = ",".join(train_tools) | 118 train_tools = ",".join(train_tools) |
| 104 if train_tools in paths_labels: | 119 if train_tools in paths_labels: |
| 105 paths_labels[train_tools] += "," + composite_labels | 120 paths_labels[train_tools] += "," + composite_labels |
| 125 for label_item in train_label.split(","): | 140 for label_item in train_label.split(","): |
| 126 label_mat[train_counter][int(label_item)] = 1.0 | 141 label_mat[train_counter][int(label_item)] = 1.0 |
| 127 train_counter += 1 | 142 train_counter += 1 |
| 128 return data_mat, label_mat | 143 return data_mat, label_mat |
| 129 | 144 |
| 130 def pad_paths(self, paths_dictionary, num_classes, standard_connections, reverse_dictionary): | 145 def pad_paths( |
| | 146 self, paths_dictionary, num_classes, standard_connections, reverse_dictionary |
| | 147 ): |
| 131 """ | 148 """ |
| 132 Add padding to the tools sequences and create multi-hot encoded labels | 149 Add padding to the tools sequences and create multi-hot encoded labels |
| 133 """ | 150 """ |
| 134 size_data = len(paths_dictionary) | 151 size_data = len(paths_dictionary) |
| 135 data_mat = np.zeros([size_data, self.max_tool_sequence_len]) | 152 data_mat = np.zeros([size_data, self.max_tool_sequence_len]) |
| 229 if last_tool_id not in l_tool_tr_samples: | 246 if last_tool_id not in l_tool_tr_samples: |
| 230 l_tool_tr_samples[last_tool_id] = list() | 247 l_tool_tr_samples[last_tool_id] = list() |
| 231 l_tool_tr_samples[last_tool_id].append(index) | 248 l_tool_tr_samples[last_tool_id].append(index) |
| 232 return l_tool_tr_samples | 249 return l_tool_tr_samples |
| 233 | 250 |
| 234 def get_data_labels_matrices(self, workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools, standard_connections, old_data_dictionary={}): | 251 def get_data_labels_matrices( |
| | 252 self, |
| | 253 workflow_paths, |
| | 254 tool_usage_path, |
| | 255 cutoff_date, |
| | 256 compatible_next_tools, |
| | 257 standard_connections, |
| | 258 old_data_dictionary={}, |
| | 259 ): |
| 235 """ | 260 """ |
| 236 Convert the training and test paths into corresponding numpy matrices | 261 Convert the training and test paths into corresponding numpy matrices |
| 237 """ | 262 """ |
| 238 processed_data, raw_paths = self.process_workflow_paths(workflow_paths) | 263 processed_data, raw_paths = self.process_workflow_paths(workflow_paths) |
| 239 dictionary, rev_dict = self.create_data_dictionary(processed_data, old_data_dictionary) | 264 dictionary, rev_dict = self.create_data_dictionary( |
| | 265 processed_data, old_data_dictionary |
| | 266 ) |
| 240 num_classes = len(dictionary) | 267 num_classes = len(dictionary) |
| 241 | 268 |
| 242 print("Raw paths: %d" % len(raw_paths)) | 269 print("Raw paths: %d" % len(raw_paths)) |
| 243 random.shuffle(raw_paths) | 270 random.shuffle(raw_paths) |
| 244 | 271 |
| 245 print("Decomposing paths...") | 272 print("Decomposing paths...") |
| 246 all_unique_paths = self.decompose_paths(raw_paths, dictionary) | 273 all_unique_paths = self.decompose_paths(raw_paths, dictionary) |
| 247 random.shuffle(all_unique_paths) | 274 random.shuffle(all_unique_paths) |
| 248 | 275 |
| 249 print("Creating dictionaries...") | 276 print("Creating dictionaries...") |
| 250 multilabels_paths = self.prepare_paths_labels_dictionary(dictionary, rev_dict, all_unique_paths, compatible_next_tools) | 277 multilabels_paths = self.prepare_paths_labels_dictionary( |
| | 278 dictionary, rev_dict, all_unique_paths, compatible_next_tools |
| | 279 ) |
| 251 | 280 |
| 252 print("Complete data: %d" % len(multilabels_paths)) | 281 print("Complete data: %d" % len(multilabels_paths)) |
| 253 train_paths_dict, test_paths_dict = self.split_test_train_data(multilabels_paths) | 282 train_paths_dict, test_paths_dict = self.split_test_train_data( |
| | 283 multilabels_paths |
| | 284 ) |
| 254 | 285 |
| 255 print("Train data: %d" % len(train_paths_dict)) | 286 print("Train data: %d" % len(train_paths_dict)) |
| 256 print("Test data: %d" % len(test_paths_dict)) | 287 print("Test data: %d" % len(test_paths_dict)) |
| 257 | 288 |
| 258 print("Padding train and test data...") | 289 print("Padding train and test data...") |
| 259 # pad training and test data with leading zeros | 290 # pad training and test data with leading zeros |
| 260 test_data, test_labels = self.pad_paths(test_paths_dict, num_classes, standard_connections, rev_dict) | 291 test_data, test_labels = self.pad_paths( |
| 261 train_data, train_labels = self.pad_paths(train_paths_dict, num_classes, standard_connections, rev_dict) | 292 test_paths_dict, num_classes, standard_connections, rev_dict |
| | 293 ) |
| | 294 train_data, train_labels = self.pad_paths( |
| | 295 train_paths_dict, num_classes, standard_connections, rev_dict |
| | 296 ) |
| 262 | 297 |
| 263 print("Estimating sample frequency...") | 298 print("Estimating sample frequency...") |
| 264 l_tool_freq = self.get_train_last_tool_freq(train_paths_dict, rev_dict) | 299 l_tool_freq = self.get_train_last_tool_freq(train_paths_dict, rev_dict) |
| 265 l_tool_tr_samples = self.get_toolid_samples(train_data, l_tool_freq) | 300 l_tool_tr_samples = self.get_toolid_samples(train_data, l_tool_freq) |
| 266 | 301 |
| 272 t_pred_usage = self.get_predicted_usage(dictionary, tool_usage_prediction) | 307 t_pred_usage = self.get_predicted_usage(dictionary, tool_usage_prediction) |
| 273 | 308 |
| 274 # get class weights using the predicted usage for each tool | 309 # get class weights using the predicted usage for each tool |
| 275 class_weights = self.assign_class_weights(num_classes, t_pred_usage) | 310 class_weights = self.assign_class_weights(num_classes, t_pred_usage) |
| 276 | 311 |
| 277 return train_data, train_labels, test_data, test_labels, dictionary, rev_dict, class_weights, t_pred_usage, l_tool_freq, l_tool_tr_samples | 312 return ( |
| | 313 train_data, |
| | 314 train_labels, |
| | 315 test_data, |
| | 316 test_labels, |
| | 317 dictionary, |
| | 318 rev_dict, |
| | 319 class_weights, |
| | 320 t_pred_usage, |
| | 321 l_tool_freq, |
| | 322 l_tool_tr_samples, |
| | 323 ) |
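The hunks above are mostly mechanical line wrapping, but the logic they touch is easier to follow with toy examples. First, `create_data_dictionary`: tools are ranked by frequency and indexed from 1, leaving 0 free as the padding value. A minimal sketch, with a made-up token list standing in for the output of `process_workflow_paths` (the real method also merges `old_data_dictionary` via `assemble_dictionary`, which is elided here):

```python
import collections

# Hypothetical token stream; in prepare_data.py this comes from
# process_workflow_paths.
words = ["bwa", "fastqc", "bwa", "samtools", "fastqc", "bwa"]

# Rank tools by frequency and index them from 1 (0 is reserved for padding).
count = collections.Counter(words).most_common()
dictionary = dict()
for word, _ in count:
    word = word.strip()
    dictionary[word] = len(dictionary) + 1
reverse_dictionary = {v: k for k, v in dictionary.items()}

print(dictionary)          # {'bwa': 1, 'fastqc': 2, 'samtools': 3}
print(reverse_dictionary)  # {1: 'bwa', 2: 'fastqc', 3: 'samtools'}
```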
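`decompose_paths` then expands each workflow path into every prefix that keeps the first tool fixed and deduplicates the result; each prefix becomes one training sample whose last tool is the prediction target. A sketch under the same toy dictionary (the path and the length cap are made up):

```python
# Hypothetical inputs.
dictionary = {"fastqc": 1, "trimmomatic": 2, "bwa": 3, "samtools": 4}
paths = ["fastqc,trimmomatic,bwa,samtools"]
max_tool_sequence_len = 25  # stands in for self.max_tool_sequence_len

sub_paths_pos = list()
for item in paths:
    tools = item.split(",")
    if len(tools) <= max_tool_sequence_len:
        # Grow a window anchored at the first tool: (t0, t1), (t0, t1, t2), ...
        for window in range(1, len(tools)):
            sequence = tools[0: window + 1]
            tools_pos = [str(dictionary[str(t)]) for t in sequence]
            if len(tools_pos) > 1:
                sub_paths_pos.append(",".join(tools_pos))
sub_paths_pos = list(set(sub_paths_pos))

print(sub_paths_pos)  # ['1,2', '1,2,3', '1,2,3,4'] in some order
```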
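`prepare_paths_labels_dictionary` turns those encoded sub-paths into a mapping from input sequence to a comma-separated label set: the observed next tool plus every tool listed as compatible with the last input tool. When the same input sequence recurs, its labels accumulate. A sketch with hypothetical compatibility data:

```python
# Hypothetical inputs.
reverse_dictionary = {1: "fastqc", 2: "trimmomatic", 3: "bwa", 4: "samtools"}
dictionary = {v: k for k, v in reverse_dictionary.items()}
compatible_next_tools = {"trimmomatic": "bwa", "bwa": "samtools"}
paths = ["1,2,3", "1,2,3,4"]

paths_labels = dict()
for item in paths:
    tools = item.split(",")
    label = tools[-1]                      # observed next tool
    train_tools = tools[: len(tools) - 1]  # input sequence
    last_tool_name = reverse_dictionary[int(train_tools[-1])]
    try:
        compatible = compatible_next_tools[last_tool_name].split(",")
    except KeyError:
        continue  # no compatibility info for this tool
    labels = [str(dictionary[x]) for x in compatible]
    labels.append(label)
    key = ",".join(train_tools)
    if key in paths_labels:
        paths_labels[key] += "," + ",".join(labels)
    else:
        paths_labels[key] = ",".join(labels)

print(paths_labels)  # {'1,2': '3,3', '1,2,3': '4,4'}
```

Duplicate label ids, as in the output above, are harmless: the multi-hot encoding in the next step sets the same index to 1.0 either way.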
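Finally, `pad_paths`, only partially visible in the hunks, pads each encoded sequence with leading zeros up to `max_tool_sequence_len` and multi-hot encodes its labels (the real method also threads `standard_connections` and `reverse_dictionary` through, which is not shown here). A sketch with assumed toy sizes:

```python
import numpy as np

# Hypothetical sizes and data; label dimension num_classes + 1 assumes
# tool indices run from 1 to num_classes, with 0 reserved for padding.
max_len = 5      # stands in for self.max_tool_sequence_len
num_classes = 6  # stands in for len(dictionary)
paths_dictionary = {"1,2": "3,3", "1,2,3": "4,4"}

data_mat = np.zeros([len(paths_dictionary), max_len])
label_mat = np.zeros([len(paths_dictionary), num_classes + 1])

for counter, (train_seq, train_label) in enumerate(paths_dictionary.items()):
    positions = train_seq.split(",")
    start = max_len - len(positions)  # leading zeros: sequence is right-aligned
    for idx, pos in enumerate(positions):
        data_mat[counter][start + idx] = int(pos)
    for label_item in train_label.split(","):
        label_mat[counter][int(label_item)] = 1.0  # multi-hot target

print(data_mat)   # [[0. 0. 0. 1. 2.]
                  #  [0. 0. 1. 2. 3.]]
print(label_mat)
```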
