Mercurial > repos > bgruening > create_tool_recommendation_model
comparison extract_workflow_connections.py @ 2:50753817983a draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
| author | bgruening |
|---|---|
| date | Sat, 09 May 2020 09:38:04 +0000 |
| parents | 22ebbac136c7 |
| children | f0da532be419 |
comparison
legend: equal, deleted, inserted, replaced
| 1:275e98795e99 | 2:50753817983a |
|---|---|
| 9 import utils | 9 import utils |
| 10 | 10 |
| 11 | 11 |
| 12 class ExtractWorkflowConnections: | 12 class ExtractWorkflowConnections: |
| 13 | 13 |
| 14 @classmethod | |
| 15 def __init__(self): | 14 def __init__(self): |
| 16 """ Init method. """ | 15 """ Init method. """ |
| 17 | 16 |
| 18 @classmethod | 17 def collect_standard_connections(self, row): |
| 18 published = row[8] | |
| 19 deleted = row[9] | |
| 20 has_errors = row[10] | |
| 21 if published == "t" and deleted == "f" and has_errors == "f": | |
| 22 return True | |
| 23 return False | |
| 24 | |
| 19 def read_tabular_file(self, raw_file_path): | 25 def read_tabular_file(self, raw_file_path): |
| 20 """ | 26 """ |
| 21 Read tabular file and extract workflow connections | 27 Read tabular file and extract workflow connections |
| 22 """ | 28 """ |
| 23 print("Reading workflows...") | 29 print("Reading workflows...") |
| 24 workflows = {} | 30 workflows = {} |
| 25 workflow_paths_dup = "" | 31 workflow_paths_dup = "" |
| 26 workflow_parents = dict() | 32 workflow_parents = dict() |
| 27 workflow_paths = list() | 33 workflow_paths = list() |
| 28 unique_paths = list() | 34 unique_paths = dict() |
| 35 standard_connections = dict() | |
| 29 with open(raw_file_path, 'rt') as workflow_connections_file: | 36 with open(raw_file_path, 'rt') as workflow_connections_file: |
| 30 workflow_connections = csv.reader(workflow_connections_file, delimiter='\t') | 37 workflow_connections = csv.reader(workflow_connections_file, delimiter='\t') |
| 31 for index, row in enumerate(workflow_connections): | 38 for index, row in enumerate(workflow_connections): |
| 32 wf_id = str(row[0]) | 39 wf_id = str(row[0]) |
| 33 in_tool = row[3] | 40 in_tool = row[3] |
| 34 out_tool = row[6] | 41 out_tool = row[6] |
| 35 if wf_id not in workflows: | 42 if wf_id not in workflows: |
| 36 workflows[wf_id] = list() | 43 workflows[wf_id] = list() |
| 37 if out_tool and in_tool and out_tool != in_tool: | 44 if out_tool and in_tool and out_tool != in_tool: |
| 38 workflows[wf_id].append((in_tool, out_tool)) | 45 workflows[wf_id].append((out_tool, in_tool)) |
| 46 qc = self.collect_standard_connections(row) | |
| 47 if qc: | |
| 48 i_t = utils.format_tool_id(in_tool) | |
| 49 o_t = utils.format_tool_id(out_tool) | |
| 50 if i_t not in standard_connections: | |
| 51 standard_connections[i_t] = list() | |
| 52 if o_t not in standard_connections[i_t]: | |
| 53 standard_connections[i_t].append(o_t) | |
| 39 print("Processing workflows...") | 54 print("Processing workflows...") |
| 40 wf_ctr = 0 | 55 wf_ctr = 0 |
| 41 for wf_id in workflows: | 56 for wf_id in workflows: |
| 42 wf_ctr += 1 | 57 wf_ctr += 1 |
| 43 workflow_parents[wf_id] = self.read_workflow(wf_id, workflows[wf_id]) | 58 workflow_parents[wf_id] = self.read_workflow(wf_id, workflows[wf_id]) |
| 52 # reverse the paths as they are computed from leaves to roots leaf | 67 # reverse the paths as they are computed from leaves to roots leaf |
| 53 paths = [tool_path for tool_path in paths] | 68 paths = [tool_path for tool_path in paths] |
| 54 if len(paths) > 0: | 69 if len(paths) > 0: |
| 55 flow_paths.extend(paths) | 70 flow_paths.extend(paths) |
| 56 workflow_paths.extend(flow_paths) | 71 workflow_paths.extend(flow_paths) |
| 57 | |
| 58 print("Workflows processed: %d" % wf_ctr) | 72 print("Workflows processed: %d" % wf_ctr) |
| 59 | 73 |
| 60 # remove slashes from the tool ids | 74 # remove slashes from the tool ids |
| 61 wf_paths_no_slash = list() | 75 wf_paths_no_slash = list() |
| 62 for path in workflow_paths: | 76 for path in workflow_paths: |
| 73 random.shuffle(unique_paths) | 87 random.shuffle(unique_paths) |
| 74 no_dup_paths = list(set(unique_paths)) | 88 no_dup_paths = list(set(unique_paths)) |
| 75 | 89 |
| 76 print("Finding compatible next tools...") | 90 print("Finding compatible next tools...") |
| 77 compatible_next_tools = self.set_compatible_next_tools(no_dup_paths) | 91 compatible_next_tools = self.set_compatible_next_tools(no_dup_paths) |
| 78 return unique_paths, compatible_next_tools | 92 return unique_paths, compatible_next_tools, standard_connections |
| 79 | 93 |
| 80 @classmethod | |
| 81 def set_compatible_next_tools(self, workflow_paths): | 94 def set_compatible_next_tools(self, workflow_paths): |
| 82 """ | 95 """ |
| 83 Find next tools for each tool | 96 Find next tools for each tool |
| 84 """ | 97 """ |
| 85 next_tools = dict() | 98 next_tools = dict() |
| 95 next_tools[current_tool] = next_tool | 108 next_tools[current_tool] = next_tool |
| 96 for tool in next_tools: | 109 for tool in next_tools: |
| 97 next_tools[tool] = ",".join(list(set(next_tools[tool].split(",")))) | 110 next_tools[tool] = ",".join(list(set(next_tools[tool].split(",")))) |
| 98 return next_tools | 111 return next_tools |
| 99 | 112 |
| 100 @classmethod | |
| 101 def read_workflow(self, wf_id, workflow_rows): | 113 def read_workflow(self, wf_id, workflow_rows): |
| 102 """ | 114 """ |
| 103 Read all connections for a workflow | 115 Read all connections for a workflow |
| 104 """ | 116 """ |
| 105 tool_parents = dict() | 117 tool_parents = dict() |
| 110 tool_parents[out_tool] = list() | 122 tool_parents[out_tool] = list() |
| 111 if in_tool not in tool_parents[out_tool]: | 123 if in_tool not in tool_parents[out_tool]: |
| 112 tool_parents[out_tool].append(in_tool) | 124 tool_parents[out_tool].append(in_tool) |
| 113 return tool_parents | 125 return tool_parents |
| 114 | 126 |
| 115 @classmethod | |
| 116 def get_roots_leaves(self, graph): | 127 def get_roots_leaves(self, graph): |
| 117 roots = list() | 128 roots = list() |
| 118 leaves = list() | 129 leaves = list() |
| 119 all_parents = list() | 130 all_parents = list() |
| 120 for item in graph: | 131 for item in graph: |
| 123 children = graph.keys() | 134 children = graph.keys() |
| 124 roots = list(set(all_parents).difference(set(children))) | 135 roots = list(set(all_parents).difference(set(children))) |
| 125 leaves = list(set(children).difference(set(all_parents))) | 136 leaves = list(set(children).difference(set(all_parents))) |
| 126 return roots, leaves | 137 return roots, leaves |
| 127 | 138 |
| 128 @classmethod | |
| 129 def find_tool_paths_workflow(self, graph, start, end, path=[]): | 139 def find_tool_paths_workflow(self, graph, start, end, path=[]): |
| 130 path = path + [end] | 140 path = path + [end] |
| 131 if start == end: | 141 if start == end: |
| 132 return [path] | 142 return [path] |
| 133 path_list = list() | 143 path_list = list() |
