|
|
# Copyright (c) 1998-2002 John Aycock## Permission is hereby granted, free of charge, to any person obtaining# a copy of this software and associated documentation files (the# "Software"), to deal in the Software without restriction, including# without limitation the rights to use, copy, modify, merge, publish,# distribute, sublicense, and/or sell copies of the Software, and to# permit persons to whom the Software is furnished to do so, subject to# the following conditions:## The above copyright notice and this permission notice shall be# included in all copies or substantial portions of the Software.## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
__version__ = 'SPARK-0.7 (pre-alpha-5)'
import re
# Compatibility with older pythons.def output(string='', end='\n'): sys.stdout.write(string + end)
try: sortedexcept NameError: def sorted(seq): seq2 = seq[:] seq2.sort() return seq2
def _namelist(instance): namelist, namedict, classlist = [], {}, [instance.__class__] for c in classlist: for b in c.__bases__: classlist.append(b) for name in c.__dict__.keys(): if name not in namedict: namelist.append(name) namedict[name] = 1 return namelist
class GenericScanner: def __init__(self, flags=0): pattern = self.reflect() self.re = re.compile(pattern, re.VERBOSE|flags)
self.index2func = {} for name, number in self.re.groupindex.items(): self.index2func[number-1] = getattr(self, 't_' + name)
def makeRE(self, name): doc = getattr(self, name).__doc__ rv = '(?P<%s>%s)' % (name[2:], doc) return rv
def reflect(self): rv = [] for name in _namelist(self): if name[:2] == 't_' and name != 't_default': rv.append(self.makeRE(name))
rv.append(self.makeRE('t_default')) return '|'.join(rv)
def error(self, s, pos): output("Lexical error at position %s" % pos) raise SystemExit
def tokenize(self, s): pos = 0 n = len(s) while pos < n: m = self.re.match(s, pos) if m is None: self.error(s, pos)
groups = m.groups() for i in range(len(groups)): if groups[i] and i in self.index2func: self.index2func[i](groups[i]) pos = m.end()
def t_default(self, s): r'( . | \n )+' output("Specification error: unmatched input") raise SystemExit
## Extracted from GenericParser and made global so that [un]picking works.#class _State: def __init__(self, stateno, items): self.T, self.complete, self.items = [], [], items self.stateno = stateno
class GenericParser: # # An Earley parser, as per J. Earley, "An Efficient Context-Free # Parsing Algorithm", CACM 13(2), pp. 94-102. Also J. C. Earley, # "An Efficient Context-Free Parsing Algorithm", Ph.D. thesis, # Carnegie-Mellon University, August 1968. New formulation of # the parser according to J. Aycock, "Practical Earley Parsing # and the SPARK Toolkit", Ph.D. thesis, University of Victoria, # 2001, and J. Aycock and R. N. Horspool, "Practical Earley # Parsing", unpublished paper, 2001. #
def __init__(self, start): self.rules = {} self.rule2func = {} self.rule2name = {} self.collectRules() self.augment(start) self.ruleschanged = 1
_NULLABLE = '\e_' _START = 'START' _BOF = '|-'
# # When pickling, take the time to generate the full state machine; # some information is then extraneous, too. Unfortunately we # can't save the rule2func map. # def __getstate__(self): if self.ruleschanged: # # XXX - duplicated from parse() # self.computeNull() self.newrules = {} self.new2old = {} self.makeNewRules() self.ruleschanged = 0 self.edges, self.cores = {}, {} self.states = { 0: self.makeState0() } self.makeState(0, self._BOF) # # XXX - should find a better way to do this.. # changes = 1 while changes: changes = 0 for k, v in self.edges.items(): if v is None: state, sym = k if state in self.states: self.goto(state, sym) changes = 1 rv = self.__dict__.copy() for s in self.states.values(): del s.items del rv['rule2func'] del rv['nullable'] del rv['cores'] return rv
def __setstate__(self, D): self.rules = {} self.rule2func = {} self.rule2name = {} self.collectRules() start = D['rules'][self._START][0][1][1] # Blech. self.augment(start) D['rule2func'] = self.rule2func D['makeSet'] = self.makeSet_fast self.__dict__ = D
# # A hook for GenericASTBuilder and GenericASTMatcher. Mess # thee not with this; nor shall thee toucheth the _preprocess # argument to addRule. # def preprocess(self, rule, func): return rule, func
def addRule(self, doc, func, _preprocess=1): fn = func rules = doc.split()
index = [] for i in range(len(rules)): if rules[i] == '::=': index.append(i-1) index.append(len(rules))
for i in range(len(index)-1): lhs = rules[index[i]] rhs = rules[index[i]+2:index[i+1]] rule = (lhs, tuple(rhs))
if _preprocess: rule, fn = self.preprocess(rule, func)
if lhs in self.rules: self.rules[lhs].append(rule) else: self.rules[lhs] = [ rule ] self.rule2func[rule] = fn self.rule2name[rule] = func.__name__[2:] self.ruleschanged = 1
def collectRules(self): for name in _namelist(self): if name[:2] == 'p_': func = getattr(self, name) doc = func.__doc__ self.addRule(doc, func)
def augment(self, start): rule = '%s ::= %s %s' % (self._START, self._BOF, start) self.addRule(rule, lambda args: args[1], 0)
def computeNull(self): self.nullable = {} tbd = []
for rulelist in self.rules.values(): lhs = rulelist[0][0] self.nullable[lhs] = 0 for rule in rulelist: rhs = rule[1] if len(rhs) == 0: self.nullable[lhs] = 1 continue # # We only need to consider rules which # consist entirely of nonterminal symbols. # This should be a savings on typical # grammars. # for sym in rhs: if sym not in self.rules: break else: tbd.append(rule) changes = 1 while changes: changes = 0 for lhs, rhs in tbd: if self.nullable[lhs]: continue for sym in rhs: if not self.nullable[sym]: break else: self.nullable[lhs] = 1 changes = 1
def makeState0(self): s0 = _State(0, []) for rule in self.newrules[self._START]: s0.items.append((rule, 0)) return s0
def finalState(self, tokens): # # Yuck. # if len(self.newrules[self._START]) == 2 and len(tokens) == 0: return 1 start = self.rules[self._START][0][1][1] return self.goto(1, start)
def makeNewRules(self): worklist = [] for rulelist in self.rules.values(): for rule in rulelist: worklist.append((rule, 0, 1, rule))
for rule, i, candidate, oldrule in worklist: lhs, rhs = rule n = len(rhs) while i < n: sym = rhs[i] if sym not in self.rules or \ not self.nullable[sym]: candidate = 0 i = i + 1 continue
newrhs = list(rhs) newrhs[i] = self._NULLABLE+sym newrule = (lhs, tuple(newrhs)) worklist.append((newrule, i+1, candidate, oldrule)) candidate = 0 i = i + 1 else: if candidate: lhs = self._NULLABLE+lhs rule = (lhs, rhs) if lhs in self.newrules: self.newrules[lhs].append(rule) else: self.newrules[lhs] = [ rule ] self.new2old[rule] = oldrule
def typestring(self, token): return None
def error(self, token): output("Syntax error at or near `%s' token" % token) raise SystemExit
def parse(self, tokens): sets = [ [(1,0), (2,0)] ] self.links = {}
if self.ruleschanged: self.computeNull() self.newrules = {} self.new2old = {} self.makeNewRules() self.ruleschanged = 0 self.edges, self.cores = {}, {} self.states = { 0: self.makeState0() } self.makeState(0, self._BOF)
for i in range(len(tokens)): sets.append([])
if sets[i] == []: break self.makeSet(tokens[i], sets, i) else: sets.append([]) self.makeSet(None, sets, len(tokens))
#_dump(tokens, sets, self.states)
finalitem = (self.finalState(tokens), 0) if finalitem not in sets[-2]: if len(tokens) > 0: self.error(tokens[i-1]) else: self.error(None)
return self.buildTree(self._START, finalitem, tokens, len(sets)-2)
def isnullable(self, sym): # # For symbols in G_e only. If we weren't supporting 1.5, # could just use sym.startswith(). # return self._NULLABLE == sym[0:len(self._NULLABLE)]
def skip(self, hs, pos=0): n = len(hs[1]) while pos < n: if not self.isnullable(hs[1][pos]): break pos = pos + 1 return pos
def makeState(self, state, sym): assert sym is not None # # Compute \epsilon-kernel state's core and see if # it exists already. # kitems = [] for rule, pos in self.states[state].items: lhs, rhs = rule if rhs[pos:pos+1] == (sym,): kitems.append((rule, self.skip(rule, pos+1))) core = kitems
core.sort() tcore = tuple(core) if tcore in self.cores: return self.cores[tcore] # # Nope, doesn't exist. Compute it and the associated # \epsilon-nonkernel state together; we'll need it right away. # k = self.cores[tcore] = len(self.states) K, NK = _State(k, kitems), _State(k+1, []) self.states[k] = K predicted = {}
edges = self.edges rules = self.newrules for X in K, NK: worklist = X.items for item in worklist: rule, pos = item lhs, rhs = rule if pos == len(rhs): X.complete.append(rule) continue
nextSym = rhs[pos] key = (X.stateno, nextSym) if nextSym not in rules: if key not in edges: edges[key] = None X.T.append(nextSym) else: edges[key] = None if nextSym not in predicted: predicted[nextSym] = 1 for prule in rules[nextSym]: ppos = self.skip(prule) new = (prule, ppos) NK.items.append(new) # # Problem: we know K needs generating, but we # don't yet know about NK. Can't commit anything # regarding NK to self.edges until we're sure. Should # we delay committing on both K and NK to avoid this # hacky code? This creates other problems.. # if X is K: edges = {}
if NK.items == []: return k
# # Check for \epsilon-nonkernel's core. Unfortunately we # need to know the entire set of predicted nonterminals # to do this without accidentally duplicating states. # core = sorted(predicted.keys()) tcore = tuple(core) if tcore in self.cores: self.edges[(k, None)] = self.cores[tcore] return k
nk = self.cores[tcore] = self.edges[(k, None)] = NK.stateno self.edges.update(edges) self.states[nk] = NK return k
def goto(self, state, sym): key = (state, sym) if key not in self.edges: # # No transitions from state on sym. # return None
rv = self.edges[key] if rv is None: # # Target state isn't generated yet. Remedy this. # rv = self.makeState(state, sym) self.edges[key] = rv return rv
def gotoT(self, state, t): return [self.goto(state, t)]
def gotoST(self, state, st): rv = [] for t in self.states[state].T: if st == t: rv.append(self.goto(state, t)) return rv
def add(self, set, item, i=None, predecessor=None, causal=None): if predecessor is None: if item not in set: set.append(item) else: key = (item, i) if item not in set: self.links[key] = [] set.append(item) self.links[key].append((predecessor, causal))
def makeSet(self, token, sets, i): cur, next = sets[i], sets[i+1]
ttype = token is not None and self.typestring(token) or None if ttype is not None: fn, arg = self.gotoT, ttype else: fn, arg = self.gotoST, token
for item in cur: ptr = (item, i) state, parent = item add = fn(state, arg) for k in add: if k is not None: self.add(next, (k, parent), i+1, ptr) nk = self.goto(k, None) if nk is not None: self.add(next, (nk, i+1))
if parent == i: continue
for rule in self.states[state].complete: lhs, rhs = rule for pitem in sets[parent]: pstate, pparent = pitem k = self.goto(pstate, lhs) if k is not None: why = (item, i, rule) pptr = (pitem, parent) self.add(cur, (k, pparent), i, pptr, why) nk = self.goto(k, None) if nk is not None: self.add(cur, (nk, i))
def makeSet_fast(self, token, sets, i): # # Call *only* when the entire state machine has been built! # It relies on self.edges being filled in completely, and # then duplicates and inlines code to boost speed at the # cost of extreme ugliness. # cur, next = sets[i], sets[i+1] ttype = token is not None and self.typestring(token) or None
for item in cur: ptr = (item, i) state, parent = item if ttype is not None: k = self.edges.get((state, ttype), None) if k is not None: #self.add(next, (k, parent), i+1, ptr) #INLINED --v new = (k, parent) key = (new, i+1) if new not in next: self.links[key] = [] next.append(new) self.links[key].append((ptr, None)) #INLINED --^ #nk = self.goto(k, None) nk = self.edges.get((k, None), None) if nk is not None: #self.add(next, (nk, i+1)) #INLINED --v new = (nk, i+1) if new not in next: next.append(new) #INLINED --^ else: add = self.gotoST(state, token) for k in add: if k is not None: self.add(next, (k, parent), i+1, ptr) #nk = self.goto(k, None) nk = self.edges.get((k, None), None) if nk is not None: self.add(next, (nk, i+1))
if parent == i: continue
for rule in self.states[state].complete: lhs, rhs = rule for pitem in sets[parent]: pstate, pparent = pitem #k = self.goto(pstate, lhs) k = self.edges.get((pstate, lhs), None) if k is not None: why = (item, i, rule) pptr = (pitem, parent) #self.add(cur, (k, pparent), # i, pptr, why) #INLINED --v new = (k, pparent) key = (new, i) if new not in cur: self.links[key] = [] cur.append(new) self.links[key].append((pptr, why)) #INLINED --^ #nk = self.goto(k, None) nk = self.edges.get((k, None), None) if nk is not None: #self.add(cur, (nk, i)) #INLINED --v new = (nk, i) if new not in cur: cur.append(new) #INLINED --^
def predecessor(self, key, causal): for p, c in self.links[key]: if c == causal: return p assert 0
def causal(self, key): links = self.links[key] if len(links) == 1: return links[0][1] choices = [] rule2cause = {} for p, c in links: rule = c[2] choices.append(rule) rule2cause[rule] = c return rule2cause[self.ambiguity(choices)]
def deriveEpsilon(self, nt): if len(self.newrules[nt]) > 1: rule = self.ambiguity(self.newrules[nt]) else: rule = self.newrules[nt][0] #output(rule)
rhs = rule[1] attr = [None] * len(rhs)
for i in range(len(rhs)-1, -1, -1): attr[i] = self.deriveEpsilon(rhs[i]) return self.rule2func[self.new2old[rule]](attr)
def buildTree(self, nt, item, tokens, k): state, parent = item
choices = [] for rule in self.states[state].complete: if rule[0] == nt: choices.append(rule) rule = choices[0] if len(choices) > 1: rule = self.ambiguity(choices) #output(rule)
rhs = rule[1] attr = [None] * len(rhs)
for i in range(len(rhs)-1, -1, -1): sym = rhs[i] if sym not in self.newrules: if sym != self._BOF: attr[i] = tokens[k-1] key = (item, k) item, k = self.predecessor(key, None) #elif self.isnullable(sym): elif self._NULLABLE == sym[0:len(self._NULLABLE)]: attr[i] = self.deriveEpsilon(sym) else: key = (item, k) why = self.causal(key) attr[i] = self.buildTree(sym, why[0], tokens, why[1]) item, k = self.predecessor(key, why) return self.rule2func[self.new2old[rule]](attr)
def ambiguity(self, rules): # # XXX - problem here and in collectRules() if the same rule # appears in >1 method. Also undefined results if rules # causing the ambiguity appear in the same method. # sortlist = [] name2index = {} for i in range(len(rules)): lhs, rhs = rule = rules[i] name = self.rule2name[self.new2old[rule]] sortlist.append((len(rhs), name)) name2index[name] = i sortlist.sort() list = [b for a, b in sortlist] return rules[name2index[self.resolve(list)]]
def resolve(self, list): # # Resolve ambiguity in favor of the shortest RHS. # Since we walk the tree from the top down, this # should effectively resolve in favor of a "shift". # return list[0]
## GenericASTBuilder automagically constructs a concrete/abstract syntax tree# for a given input. The extra argument is a class (not an instance!)# which supports the "__setslice__" and "__len__" methods.## XXX - silently overrides any user code in methods.#
class GenericASTBuilder(GenericParser): def __init__(self, AST, start): GenericParser.__init__(self, start) self.AST = AST
def preprocess(self, rule, func): rebind = lambda lhs, self=self: \ lambda args, lhs=lhs, self=self: \ self.buildASTNode(args, lhs) lhs, rhs = rule return rule, rebind(lhs)
def buildASTNode(self, args, lhs): children = [] for arg in args: if isinstance(arg, self.AST): children.append(arg) else: children.append(self.terminal(arg)) return self.nonterminal(lhs, children)
def terminal(self, token): return token
def nonterminal(self, type, args): rv = self.AST(type) rv[:len(args)] = args return rv
## GenericASTTraversal is a Visitor pattern according to Design Patterns. For# each node it attempts to invoke the method n_<node type>, falling# back onto the default() method if the n_* can't be found. The preorder# traversal also looks for an exit hook named n_<node type>_exit (no default# routine is called if it's not found). To prematurely halt traversal# of a subtree, call the prune() method -- this only makes sense for a# preorder traversal. Node type is determined via the typestring() method.#
class GenericASTTraversalPruningException: pass
class GenericASTTraversal: def __init__(self, ast): self.ast = ast
def typestring(self, node): return node.type
def prune(self): raise GenericASTTraversalPruningException
def preorder(self, node=None): if node is None: node = self.ast
try: name = 'n_' + self.typestring(node) if hasattr(self, name): func = getattr(self, name) func(node) else: self.default(node) except GenericASTTraversalPruningException: return
for kid in node: self.preorder(kid)
name = name + '_exit' if hasattr(self, name): func = getattr(self, name) func(node)
def postorder(self, node=None): if node is None: node = self.ast
for kid in node: self.postorder(kid)
name = 'n_' + self.typestring(node) if hasattr(self, name): func = getattr(self, name) func(node) else: self.default(node)
def default(self, node): pass
## GenericASTMatcher. AST nodes must have "__getitem__" and "__cmp__"# implemented.## XXX - makes assumptions about how GenericParser walks the parse tree.#
class GenericASTMatcher(GenericParser): def __init__(self, start, ast): GenericParser.__init__(self, start) self.ast = ast
def preprocess(self, rule, func): rebind = lambda func, self=self: \ lambda args, func=func, self=self: \ self.foundMatch(args, func) lhs, rhs = rule rhslist = list(rhs) rhslist.reverse()
return (lhs, tuple(rhslist)), rebind(func)
def foundMatch(self, args, func): func(args[-1]) return args[-1]
def match_r(self, node): self.input.insert(0, node) children = 0
for child in node: if children == 0: self.input.insert(0, '(') children = children + 1 self.match_r(child)
if children > 0: self.input.insert(0, ')')
def match(self, ast=None): if ast is None: ast = self.ast self.input = []
self.match_r(ast) self.parse(self.input)
def resolve(self, list): # # Resolve ambiguity in favor of the longest RHS. # return list[-1]
def _dump(tokens, sets, states): for i in range(len(sets)): output('set %d' % i) for item in sets[i]: output('\t', item) for (lhs, rhs), pos in states[item[0]].items: output('\t\t', lhs, '::=', end='') output(' '.join(rhs[:pos]), end='') output('.', end='') output(' '.join(rhs[pos:])) if i < len(tokens): output() output('token %s' % str(tokens[i])) output()
|