| 1 | n/a | """A bottom-up tree matching algorithm implementation meant to speed |
|---|
| 2 | n/a | up 2to3's matching process. After the tree patterns are reduced to |
|---|
| 3 | n/a | their rarest linear path, a linear Aho-Corasick automaton is |
|---|
| 4 | n/a | created. The linear automaton traverses the linear paths from the |
|---|
| 5 | n/a | leaves to the root of the AST and returns a set of nodes for further |
|---|
| 6 | n/a | matching. This reduces significantly the number of candidate nodes.""" |
|---|
| 7 | n/a | |
|---|
| 8 | n/a | __author__ = "George Boutsioukis <gboutsioukis@gmail.com>" |
|---|
| 9 | n/a | |
|---|
| 10 | n/a | import logging |
|---|
| 11 | n/a | import itertools |
|---|
| 12 | n/a | from collections import defaultdict |
|---|
| 13 | n/a | |
|---|
| 14 | n/a | from . import pytree |
|---|
| 15 | n/a | from .btm_utils import reduce_tree |
|---|
| 16 | n/a | |
|---|
| 17 | n/a | class BMNode(object): |
|---|
| 18 | n/a | """Class for a node of the Aho-Corasick automaton used in matching""" |
|---|
| 19 | n/a | count = itertools.count() |
|---|
| 20 | n/a | def __init__(self): |
|---|
| 21 | n/a | self.transition_table = {} |
|---|
| 22 | n/a | self.fixers = [] |
|---|
| 23 | n/a | self.id = next(BMNode.count) |
|---|
| 24 | n/a | self.content = '' |
|---|
| 25 | n/a | |
|---|
| 26 | n/a | class BottomMatcher(object): |
|---|
| 27 | n/a | """The main matcher class. After instantiating the patterns should |
|---|
| 28 | n/a | be added using the add_fixer method""" |
|---|
| 29 | n/a | |
|---|
| 30 | n/a | def __init__(self): |
|---|
| 31 | n/a | self.match = set() |
|---|
| 32 | n/a | self.root = BMNode() |
|---|
| 33 | n/a | self.nodes = [self.root] |
|---|
| 34 | n/a | self.fixers = [] |
|---|
| 35 | n/a | self.logger = logging.getLogger("RefactoringTool") |
|---|
| 36 | n/a | |
|---|
| 37 | n/a | def add_fixer(self, fixer): |
|---|
| 38 | n/a | """Reduces a fixer's pattern tree to a linear path and adds it |
|---|
| 39 | n/a | to the matcher(a common Aho-Corasick automaton). The fixer is |
|---|
| 40 | n/a | appended on the matching states and called when they are |
|---|
| 41 | n/a | reached""" |
|---|
| 42 | n/a | self.fixers.append(fixer) |
|---|
| 43 | n/a | tree = reduce_tree(fixer.pattern_tree) |
|---|
| 44 | n/a | linear = tree.get_linear_subpattern() |
|---|
| 45 | n/a | match_nodes = self.add(linear, start=self.root) |
|---|
| 46 | n/a | for match_node in match_nodes: |
|---|
| 47 | n/a | match_node.fixers.append(fixer) |
|---|
| 48 | n/a | |
|---|
| 49 | n/a | def add(self, pattern, start): |
|---|
| 50 | n/a | "Recursively adds a linear pattern to the AC automaton" |
|---|
| 51 | n/a | #print("adding pattern", pattern, "to", start) |
|---|
| 52 | n/a | if not pattern: |
|---|
| 53 | n/a | #print("empty pattern") |
|---|
| 54 | n/a | return [start] |
|---|
| 55 | n/a | if isinstance(pattern[0], tuple): |
|---|
| 56 | n/a | #alternatives |
|---|
| 57 | n/a | #print("alternatives") |
|---|
| 58 | n/a | match_nodes = [] |
|---|
| 59 | n/a | for alternative in pattern[0]: |
|---|
| 60 | n/a | #add all alternatives, and add the rest of the pattern |
|---|
| 61 | n/a | #to each end node |
|---|
| 62 | n/a | end_nodes = self.add(alternative, start=start) |
|---|
| 63 | n/a | for end in end_nodes: |
|---|
| 64 | n/a | match_nodes.extend(self.add(pattern[1:], end)) |
|---|
| 65 | n/a | return match_nodes |
|---|
| 66 | n/a | else: |
|---|
| 67 | n/a | #single token |
|---|
| 68 | n/a | #not last |
|---|
| 69 | n/a | if pattern[0] not in start.transition_table: |
|---|
| 70 | n/a | #transition did not exist, create new |
|---|
| 71 | n/a | next_node = BMNode() |
|---|
| 72 | n/a | start.transition_table[pattern[0]] = next_node |
|---|
| 73 | n/a | else: |
|---|
| 74 | n/a | #transition exists already, follow |
|---|
| 75 | n/a | next_node = start.transition_table[pattern[0]] |
|---|
| 76 | n/a | |
|---|
| 77 | n/a | if pattern[1:]: |
|---|
| 78 | n/a | end_nodes = self.add(pattern[1:], start=next_node) |
|---|
| 79 | n/a | else: |
|---|
| 80 | n/a | end_nodes = [next_node] |
|---|
| 81 | n/a | return end_nodes |
|---|
| 82 | n/a | |
|---|
| 83 | n/a | def run(self, leaves): |
|---|
| 84 | n/a | """The main interface with the bottom matcher. The tree is |
|---|
| 85 | n/a | traversed from the bottom using the constructed |
|---|
| 86 | n/a | automaton. Nodes are only checked once as the tree is |
|---|
| 87 | n/a | retraversed. When the automaton fails, we give it one more |
|---|
| 88 | n/a | shot(in case the above tree matches as a whole with the |
|---|
| 89 | n/a | rejected leaf), then we break for the next leaf. There is the |
|---|
| 90 | n/a | special case of multiple arguments(see code comments) where we |
|---|
| 91 | n/a | recheck the nodes |
|---|
| 92 | n/a | |
|---|
| 93 | n/a | Args: |
|---|
| 94 | n/a | The leaves of the AST tree to be matched |
|---|
| 95 | n/a | |
|---|
| 96 | n/a | Returns: |
|---|
| 97 | n/a | A dictionary of node matches with fixers as the keys |
|---|
| 98 | n/a | """ |
|---|
| 99 | n/a | current_ac_node = self.root |
|---|
| 100 | n/a | results = defaultdict(list) |
|---|
| 101 | n/a | for leaf in leaves: |
|---|
| 102 | n/a | current_ast_node = leaf |
|---|
| 103 | n/a | while current_ast_node: |
|---|
| 104 | n/a | current_ast_node.was_checked = True |
|---|
| 105 | n/a | for child in current_ast_node.children: |
|---|
| 106 | n/a | # multiple statements, recheck |
|---|
| 107 | n/a | if isinstance(child, pytree.Leaf) and child.value == ";": |
|---|
| 108 | n/a | current_ast_node.was_checked = False |
|---|
| 109 | n/a | break |
|---|
| 110 | n/a | if current_ast_node.type == 1: |
|---|
| 111 | n/a | #name |
|---|
| 112 | n/a | node_token = current_ast_node.value |
|---|
| 113 | n/a | else: |
|---|
| 114 | n/a | node_token = current_ast_node.type |
|---|
| 115 | n/a | |
|---|
| 116 | n/a | if node_token in current_ac_node.transition_table: |
|---|
| 117 | n/a | #token matches |
|---|
| 118 | n/a | current_ac_node = current_ac_node.transition_table[node_token] |
|---|
| 119 | n/a | for fixer in current_ac_node.fixers: |
|---|
| 120 | n/a | if not fixer in results: |
|---|
| 121 | n/a | results[fixer] = [] |
|---|
| 122 | n/a | results[fixer].append(current_ast_node) |
|---|
| 123 | n/a | |
|---|
| 124 | n/a | else: |
|---|
| 125 | n/a | #matching failed, reset automaton |
|---|
| 126 | n/a | current_ac_node = self.root |
|---|
| 127 | n/a | if (current_ast_node.parent is not None |
|---|
| 128 | n/a | and current_ast_node.parent.was_checked): |
|---|
| 129 | n/a | #the rest of the tree upwards has been checked, next leaf |
|---|
| 130 | n/a | break |
|---|
| 131 | n/a | |
|---|
| 132 | n/a | #recheck the rejected node once from the root |
|---|
| 133 | n/a | if node_token in current_ac_node.transition_table: |
|---|
| 134 | n/a | #token matches |
|---|
| 135 | n/a | current_ac_node = current_ac_node.transition_table[node_token] |
|---|
| 136 | n/a | for fixer in current_ac_node.fixers: |
|---|
| 137 | n/a | if not fixer in results.keys(): |
|---|
| 138 | n/a | results[fixer] = [] |
|---|
| 139 | n/a | results[fixer].append(current_ast_node) |
|---|
| 140 | n/a | |
|---|
| 141 | n/a | current_ast_node = current_ast_node.parent |
|---|
| 142 | n/a | return results |
|---|
| 143 | n/a | |
|---|
| 144 | n/a | def print_ac(self): |
|---|
| 145 | n/a | "Prints a graphviz diagram of the BM automaton(for debugging)" |
|---|
| 146 | n/a | print("digraph g{") |
|---|
| 147 | n/a | def print_node(node): |
|---|
| 148 | n/a | for subnode_key in node.transition_table.keys(): |
|---|
| 149 | n/a | subnode = node.transition_table[subnode_key] |
|---|
| 150 | n/a | print("%d -> %d [label=%s] //%s" % |
|---|
| 151 | n/a | (node.id, subnode.id, type_repr(subnode_key), str(subnode.fixers))) |
|---|
| 152 | n/a | if subnode_key == 1: |
|---|
| 153 | n/a | print(subnode.content) |
|---|
| 154 | n/a | print_node(subnode) |
|---|
| 155 | n/a | print_node(self.root) |
|---|
| 156 | n/a | print("}") |
|---|
| 157 | n/a | |
|---|
| 158 | n/a | # taken from pytree.py for debugging; only used by print_ac |
|---|
| 159 | n/a | _type_reprs = {} |
|---|
| 160 | n/a | def type_repr(type_num): |
|---|
| 161 | n/a | global _type_reprs |
|---|
| 162 | n/a | if not _type_reprs: |
|---|
| 163 | n/a | from .pygram import python_symbols |
|---|
| 164 | n/a | # printing tokens is possible but not as useful |
|---|
| 165 | n/a | # from .pgen2 import token // token.__dict__.items(): |
|---|
| 166 | n/a | for name, val in python_symbols.__dict__.items(): |
|---|
| 167 | n/a | if type(val) == int: _type_reprs[val] = name |
|---|
| 168 | n/a | return _type_reprs.setdefault(type_num, type_num) |
|---|