comparison: prepare_data.py @ 0:22ebbac136c7 (draft)
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
| | |
|---|---|
| author | bgruening |
| date | Wed, 28 Aug 2019 07:19:13 -0400 |
| parents | |
| children | 50753817983a |
comparison: -1:000000000000 → 0:22ebbac136c7 (file added in this revision)
| 1 """ | |
| 2 Prepare the workflow paths to be used by downstream | |
| 3 machine learning algorithm. The paths are divided | |
| 4 into the test and training sets | |
| 5 """ | |
| 6 | |
| 7 import os | |
| 8 import collections | |
| 9 import numpy as np | |
| 10 import random | |
| 11 | |
| 12 import predict_tool_usage | |
| 13 | |
| 14 main_path = os.getcwd() | |
| 15 | |
| 16 | |


class PrepareData:

    def __init__(self, max_seq_length, test_data_share):
        """ Init method. """
        self.max_tool_sequence_len = max_seq_length
        self.test_share = test_data_share

    def process_workflow_paths(self, workflow_paths):
        """
        Get all the tools and the complete set of individual paths for each workflow
        """
        tokens = list()
        raw_paths = [x.replace("\n", "") for x in workflow_paths]
        for item in raw_paths:
            split_items = item.split(",")
            for token in split_items:
                if token != "":
                    tokens.append(token)
        tokens = list(set(tokens))
        tokens = np.array(tokens)
        tokens = np.reshape(tokens, [-1, ])
        return tokens, raw_paths
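
    # Illustrative behaviour (not in the original): calling
    # process_workflow_paths(["t1,t2,t3\n", "t1,t4\n"]) returns a unique
    # token array such as ["t1", "t2", "t3", "t4"] (order unspecified) and
    # the cleaned paths ["t1,t2,t3", "t1,t4"].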

    def create_new_dict(self, new_data_dict):
        """
        Create a new data dictionary and its reverse mapping
        """
        reverse_dict = dict((v, k) for k, v in new_data_dict.items())
        return new_data_dict, reverse_dict

    def assemble_dictionary(self, new_data_dict, old_data_dictionary={}):
        """
        Create/update tool indices in the forward and backward dictionaries
        """
        # old_data_dictionary is accepted but not used in this revision
        new_data_dict, reverse_dict = self.create_new_dict(new_data_dict)
        return new_data_dict, reverse_dict

    def create_data_dictionary(self, words, old_data_dictionary={}):
        """
        Create two dictionaries mapping tool names to indices and back
        """
        count = collections.Counter(words).most_common()
        dictionary = dict()
        for word, _ in count:
            # indices start at 1; index 0 is reserved for padding
            dictionary[word] = len(dictionary) + 1
        dictionary, reverse_dictionary = self.assemble_dictionary(dictionary, old_data_dictionary)
        return dictionary, reverse_dictionary
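
    # Illustrative mapping (indices depend on token frequency): for the words
    # ["t1", "t2", "t1"], the forward dictionary is {"t1": 1, "t2": 2} and the
    # reverse dictionary is {1: "t1", 2: "t2"}.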

    def decompose_paths(self, paths, dictionary):
        """
        Decompose the paths into variable-length sub-paths keeping the first tool fixed
        """
        sub_paths_pos = list()
        for item in paths:
            tools = item.split(",")
            len_tools = len(tools)
            if len_tools <= self.max_tool_sequence_len:
                for window in range(1, len_tools):
                    sequence = tools[0: window + 1]
                    tools_pos = [str(dictionary[str(tool_item)]) for tool_item in sequence]
                    if len(tools_pos) > 1:
                        sub_paths_pos.append(",".join(tools_pos))
        sub_paths_pos = list(set(sub_paths_pos))
        return sub_paths_pos
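
    # Illustrative sketch (hypothetical dictionary): with
    # {"a": 1, "b": 2, "c": 3} and max_tool_sequence_len >= 3, the path
    # "a,b,c" decomposes into the index sub-paths "1,2" and "1,2,3".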

    def prepare_paths_labels_dictionary(self, dictionary, reverse_dictionary, paths, compatible_next_tools):
        """
        Create a dictionary of sequences with their labels for training and test paths
        """
        paths_labels = dict()
        random.shuffle(paths)
        for item in paths:
            if item:
                tools = item.split(",")
                label = tools[-1]
                train_tools = tools[:-1]
                last_but_one_name = reverse_dictionary[int(train_tools[-1])]
                try:
                    compatible_tools = compatible_next_tools[last_but_one_name].split(",")
                except KeyError:
                    continue
                if len(compatible_tools) > 0:
                    compatible_tools_ids = [str(dictionary[x]) for x in compatible_tools]
                    compatible_tools_ids.append(label)
                    composite_labels = ",".join(compatible_tools_ids)
                    train_tools = ",".join(train_tools)
                    if train_tools in paths_labels:
                        paths_labels[train_tools] += "," + composite_labels
                    else:
                        paths_labels[train_tools] = composite_labels
        for item in paths_labels:
            # de-duplicate the accumulated label ids for each sequence
            paths_labels[item] = ",".join(list(set(paths_labels[item].split(","))))
        return paths_labels
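
    # Illustrative entry (hypothetical indices): for the sub-path "1,2,3",
    # the training sequence is "1,2" and its labels combine the true next
    # tool (index 3) with the compatible next tools of tool 2, e.g. yielding
    # {"1,2": "3,4"} after de-duplication (the order of ids is arbitrary).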

    def pad_paths(self, paths_dictionary, num_classes):
        """
        Add padding to the tool sequences and create multi-hot encoded labels
        """
        size_data = len(paths_dictionary)
        data_mat = np.zeros([size_data, self.max_tool_sequence_len])
        # num_classes + 1 columns because index 0 is reserved for padding
        label_mat = np.zeros([size_data, num_classes + 1])
        train_counter = 0
        for train_seq, train_label in list(paths_dictionary.items()):
            positions = train_seq.split(",")
            # left-pad: align each sequence to the right end of its row
            start_pos = self.max_tool_sequence_len - len(positions)
            for id_pos, pos in enumerate(positions):
                data_mat[train_counter][start_pos + id_pos] = int(pos)
            for label_item in train_label.split(","):
                label_mat[train_counter][int(label_item)] = 1.0
            train_counter += 1
        return data_mat, label_mat
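
    # Illustrative row (assuming max_tool_sequence_len = 5): the sequence
    # "3,1,4" is left-padded to [0, 0, 3, 1, 4], and each of its label ids
    # sets the corresponding column of the multi-hot label row to 1.0.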

    def split_test_train_data(self, multilabels_paths):
        """
        Split into test and train data randomly for each run
        """
        train_dict = dict()
        test_dict = dict()
        all_paths = list(multilabels_paths.keys())
        # shuffle the list in place; shuffling a throwaway copy has no effect
        random.shuffle(all_paths)
        split_number = int(self.test_share * len(all_paths))
        for index, path in enumerate(all_paths):
            if index < split_number:
                test_dict[path] = multilabels_paths[path]
            else:
                train_dict[path] = multilabels_paths[path]
        return train_dict, test_dict
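
    # Example: with test_share = 0.2 and 10 paths, the first 2 shuffled
    # paths form the test set and the remaining 8 the training set.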

    def verify_overlap(self, train_paths, test_paths):
        """
        Report the overlap of samples between train and test data
        """
        intersection = list(set(train_paths).intersection(set(test_paths)))
        print("Overlap in train and test: %d" % len(intersection))

    def get_predicted_usage(self, data_dictionary, predicted_usage):
        """
        Get predicted usage for tools
        """
        usage = dict()
        epsilon = 0.0
        # index 0 does not belong to any tool
        usage[0] = epsilon
        for k, v in data_dictionary.items():
            try:
                usg = predicted_usage[k]
                if usg < epsilon:
                    usg = epsilon
                usage[v] = usg
            except KeyError:
                usage[v] = epsilon
        return usage
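
    # Illustrative result: with data_dictionary = {"t1": 1} and
    # predicted_usage = {"t1": 3.7}, the method returns {0: 0.0, 1: 3.7}.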

    def assign_class_weights(self, n_classes, predicted_usage):
        """
        Compute class weights using usage
        """
        class_weights = dict()
        # integer key for consistency with the remaining class indices
        class_weights[0] = 0.0
        for key in range(1, n_classes):
            u_score = predicted_usage[key]
            if u_score < 1.0:
                u_score += 1.0
            class_weights[key] = np.log(u_score)
        return class_weights
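
    # Illustrative values: a predicted usage of 0.5 becomes log(1.5) ~ 0.405,
    # while a usage of 10.0 stays log(10.0) ~ 2.303.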

    def get_sample_weights(self, train_data, reverse_dictionary, paths_frequency):
        """
        Compute the frequency of paths in training data
        """
        path_weights = np.zeros(len(train_data))
        for path_index, path in enumerate(train_data):
            # recover the tool names from the non-zero (unpadded) positions
            sample_pos = np.where(path > 0)[0]
            sample_tool_pos = path[sample_pos[0]:]
            path_name = ",".join([reverse_dictionary[int(tool_pos)] for tool_pos in sample_tool_pos])
            try:
                path_weights[path_index] = int(paths_frequency[path_name])
            except KeyError:
                path_weights[path_index] = 1
        return path_weights
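
    # Illustrative behaviour: a padded row decoding to "t1,t2" takes the
    # frequency stored under that path name in paths_frequency, or 1 when
    # the path is absent.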

    def get_data_labels_matrices(self, workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools, old_data_dictionary={}):
        """
        Convert the training and test paths into corresponding numpy matrices
        """
        processed_data, raw_paths = self.process_workflow_paths(workflow_paths)
        dictionary, reverse_dictionary = self.create_data_dictionary(processed_data, old_data_dictionary)
        num_classes = len(dictionary)

        print("Raw paths: %d" % len(raw_paths))
        random.shuffle(raw_paths)

        print("Decomposing paths...")
        all_unique_paths = self.decompose_paths(raw_paths, dictionary)
        random.shuffle(all_unique_paths)

        print("Creating dictionaries...")
        multilabels_paths = self.prepare_paths_labels_dictionary(dictionary, reverse_dictionary, all_unique_paths, compatible_next_tools)

        print("Complete data: %d" % len(multilabels_paths))
        train_paths_dict, test_paths_dict = self.split_test_train_data(multilabels_paths)

        print("Train data: %d" % len(train_paths_dict))
        print("Test data: %d" % len(test_paths_dict))

        test_data, test_labels = self.pad_paths(test_paths_dict, num_classes)
        train_data, train_labels = self.pad_paths(train_paths_dict, num_classes)

        # predict tool usage
        print("Predicting tools' usage...")
        usage_pred = predict_tool_usage.ToolPopularity()
        usage = usage_pred.extract_tool_usage(tool_usage_path, cutoff_date, dictionary)
        # spelling follows the method name defined in predict_tool_usage
        tool_usage_prediction = usage_pred.get_pupularity_prediction(usage)
        tool_predicted_usage = self.get_predicted_usage(dictionary, tool_usage_prediction)

        # get class weights using the predicted usage for each tool
        class_weights = self.assign_class_weights(train_labels.shape[1], tool_predicted_usage)

        return train_data, train_labels, test_data, test_labels, dictionary, reverse_dictionary, class_weights, tool_predicted_usage
```
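
A minimal usage sketch (not part of this commit) shows how the class might be driven. It assumes the workflow paths have already been extracted as comma-separated tool sequences, one per line; the file names, cutoff date, and `compatible_next_tools` mapping below are illustrative placeholders, and `predict_tool_usage.py` from this repository must sit next to `prepare_data.py`.

```python
from prepare_data import PrepareData

# hypothetical input file: one comma-separated tool sequence per line
with open("workflow_paths.txt") as f:
    workflow_paths = f.readlines()

# hypothetical mapping: tool name -> comma-separated compatible next tools
compatible_next_tools = {"bowtie2": "samtools_sort,bam_to_sam"}

data = PrepareData(max_seq_length=25, test_data_share=0.2)
(train_data, train_labels, test_data, test_labels,
 dictionary, reverse_dictionary, class_weights,
 tool_predicted_usage) = data.get_data_labels_matrices(
    workflow_paths,
    "tool_usage.tsv",   # hypothetical usage log consumed by predict_tool_usage
    "2019-01-01",       # hypothetical cutoff date for usage extraction
    compatible_next_tools,
)
```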
