Mercurial > repos > bgruening > create_tool_recommendation_model
comparison extract_workflow_connections.py @ 0:22ebbac136c7 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
| author | bgruening |
|---|---|
| date | Wed, 28 Aug 2019 07:19:13 -0400 |
| parents | |
| children | 50753817983a |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:22ebbac136c7 |
|---|---|
| 1 """ | |
| 2 Extract workflow paths from the tabular file containing | |
| 3 input and output tools | |
| 4 """ | |
| 5 | |
| 6 import csv | |
| 7 import random | |
| 8 | |
| 9 import utils | |
| 10 | |
| 11 | |
class ExtractWorkflowConnections:
    """
    Extract tool paths and next-tool lookups from workflow connections.

    Every public method is a classmethod and the class keeps no state, so
    the methods can be called on the class itself or on an instance.
    """

    def __init__(self):
        """ Init method. The class holds no instance state. """

    @classmethod
    def read_tabular_file(cls, raw_file_path):
        """
        Read tabular file and extract workflow connections.

        The file is parsed as tab-separated values and read by position:
        column 0 is the workflow id, column 3 the ``in_tool`` and column 6
        the ``out_tool``. A row adds a connection only when both tools are
        non-empty and differ from each other. Blank rows are skipped.

        Returns a tuple ``(paths, compatible_next_tools)`` where ``paths``
        is a shuffled list of comma-joined tool paths (duplicates are kept)
        and ``compatible_next_tools`` is the result of
        ``set_compatible_next_tools`` computed on the de-duplicated paths.

        Raises ``IndexError`` for a non-blank row with fewer than seven
        columns.
        """
        print("Reading workflows...")
        workflows = {}
        # newline="" lets the csv module do its own line-ending handling,
        # as recommended by the csv documentation.
        with open(raw_file_path, "rt", newline="") as workflow_connections_file:
            workflow_connections = csv.reader(workflow_connections_file, delimiter="\t")
            for row in workflow_connections:
                if not row:
                    # csv.reader yields [] for a blank line (e.g. a
                    # trailing newline); there is nothing to index.
                    continue
                wf_id = str(row[0])
                in_tool = row[3]
                out_tool = row[6]
                # Register the workflow even if this row adds no connection.
                connections = workflows.setdefault(wf_id, [])
                if out_tool and in_tool and out_tool != in_tool:
                    connections.append((in_tool, out_tool))

        print("Processing workflows...")
        workflow_parents = {
            wf_id: cls.read_workflow(wf_id, connections)
            for wf_id, connections in workflows.items()
        }

        workflow_paths = []
        for parents_graph in workflow_parents.values():
            roots, leaves = cls.get_roots_leaves(parents_graph)
            for root in roots:
                for leaf in leaves:
                    # Paths are kept exactly as find_tool_paths_workflow
                    # returns them (leaf first, root last); no reversal
                    # is applied here.
                    workflow_paths.extend(
                        cls.find_tool_paths_workflow(parents_graph, root, leaf)
                    )

        print("Workflows processed: %d" % len(workflows))

        # Normalise every tool id with utils.format_tool_id (per the original
        # comment this removes slashes from the ids) and join each path
        # into one comma-separated string.
        joined_paths = [
            ",".join(utils.format_tool_id(tool_id) for tool_id in path)
            for path in workflow_paths
        ]

        # Drop empty strings; duplicates are intentionally kept in the
        # returned list.
        all_paths = [joined for joined in joined_paths if joined]
        random.shuffle(all_paths)
        no_dup_paths = list(set(all_paths))

        print("Finding compatible next tools...")
        compatible_next_tools = cls.set_compatible_next_tools(no_dup_paths)
        return all_paths, compatible_next_tools

    @classmethod
    def set_compatible_next_tools(cls, workflow_paths):
        """
        Find next tools for each tool.

        ``workflow_paths`` is an iterable of comma-separated tool paths.
        Returns a dict mapping each tool that has a successor in any path
        to a comma-joined string of its distinct immediate successors,
        sorted so the result is reproducible between runs.
        """
        next_tools = {}
        for path in workflow_paths:
            tools = path.split(",")
            # Walk over consecutive (current, next) pairs of the path.
            for current_tool, next_tool in zip(tools, tools[1:]):
                next_tools.setdefault(current_tool, set()).add(next_tool)
        return {
            tool: ",".join(sorted(followers))
            for tool, followers in next_tools.items()
        }

    @classmethod
    def read_workflow(cls, wf_id, workflow_rows):
        """
        Read all connections for a workflow.

        ``workflow_rows`` is an iterable of ``(in_tool, out_tool)`` pairs.
        Returns a dict mapping each ``out_tool`` to the list of distinct
        ``in_tool`` values seen with it, in first-seen order. ``wf_id`` is
        accepted for interface compatibility and is not used.
        """
        tool_parents = {}
        for in_tool, out_tool in workflow_rows:
            parents = tool_parents.setdefault(out_tool, [])
            if in_tool not in parents:
                parents.append(in_tool)
        return tool_parents

    @classmethod
    def get_roots_leaves(cls, graph):
        """
        Split the nodes of a parents graph into roots and leaves.

        ``graph`` maps a node to the list of its parents. Roots are nodes
        that appear only as parents; leaves are keys that never appear as
        a parent. Returns ``(roots, leaves)`` as two lists whose order is
        unspecified.
        """
        children = set(graph)
        all_parents = set()
        for parents in graph.values():
            all_parents.update(parents)
        roots = list(all_parents - children)
        leaves = list(children - all_parents)
        return roots, leaves

    @classmethod
    def find_tool_paths_workflow(cls, graph, start, end, path=None):
        """
        Find all simple paths between two nodes of a parents graph.

        The search begins at ``end`` and follows ``graph[node]`` (the
        parents of each node) until ``start`` is reached, so every
        returned path lists ``end`` first and ``start`` last. Nodes
        already on the current path are skipped, which prevents cycles.
        ``path`` is the prefix accumulated by the recursion; callers
        normally omit it.

        Returns a list of paths, each a list of nodes; the list is empty
        when ``start`` cannot be reached from ``end``.
        """
        # Build a new list instead of mutating the caller's (or a shared
        # default) list.
        path = ([] if path is None else path) + [end]
        if start == end:
            return [path]
        path_list = []
        for node in graph.get(end, ()):
            if node not in path:
                path_list.extend(
                    cls.find_tool_paths_workflow(graph, start, node, path)
                )
        return path_list
