# -*- Mode: Python; tab-width: 4 -*- # A simple table-based lexer framework. import string LexicalError = "LexicalError" # # possible improvements: # # verify completeness, i.e., verify that any character # delivered from any state is handled. # # use tables for character classes # class lexer: def __init__ (self, token_consumer, initial_state): # `token_consumer' is a method or function of two arguments, # and self.token_consumer = token_consumer self.clear_buffer() self.translate_machine (self.build_machine()) self.initial_state = self.set_state (initial_state) def set_state (self, state_key): "Set the machine's state. Return the index for that state" self.state = self.state_keys.index (state_key) return self.state # The machine created by 'build_machine()' is in a format # convenient for humans: it is a dictionary, using strings to name # states. This method converts the machine to one based on # tables. [i.e., states are numbers, which are in turn indices # into the state machine table]. def translate_machine (self, machine): # build a table version of the machine states = machine.keys() states.sort() skmap = {} for i in range(len(states)): skmap[states[i]] = i table = [] for state in states: entry = [] for char_class, action, next_state in machine[state]: if not skmap.has_key (next_state): raise ValueError, "undefined state: %s" % next_state entry.append ((char_class, action, skmap[next_state])) table.append (entry) self.table_machine = table self.state_keys = states def clear_buffer (self): self.buffer = [] def next_char (self, ch): for char_class, action, next_state in self.table_machine[self.state]: if (char_class is None) or (ch in char_class): if (type(action) == type('')): self.token_consumer ('punctuation', action) self.state = self.initial_state self.clear_buffer() return else: action (ch) self.state = next_state if (char_class is None) and (next_state == self.initial_state): # If we transition to the start state via wildcard, # then assume that we want to process the offending # character. self.next_char (ch) return raise LexicalError, "unhandled character: %s" % ch def action_drop (self, ch): pass def action_shift (self, ch): self.buffer.append (ch) def action_emit (self, ch): "Default emitter. Names the token class with the state key" self.token_consumer ( self.state_keys[self.state], string.join (self.buffer, '') ) self.clear_buffer() def action_error (self, ch): raise LexicalError, (self.state_keys[state], ch) # =========================================================================== # Everything below this point is lexer-specific. In other words, # derive from this class and override this stuff (or simply # rewrite it). # # The lexer below is a simple recognizer of identifiers, strings, and numbers. # =========================================================================== def emit_integer (self, ch): self.token_consumer ( 'integer', string.atoi (string.join (self.buffer, '')) ) self.clear_buffer() def emit_float (self, ch): self.token_consumer ( 'float', string.atof (string.join (self.buffer, '')) ) self.clear_buffer() def build_machine (self): # Character classes LETTER = string.letters DIGIT = string.digits WHITESPACE = string.whitespace PUNCTUATION = "()[]:=," QUOTE = "'\"" # actions DROP = self.action_drop SHIFT = self.action_shift ERROR = self.action_error EMIT = self.action_emit # Machine format is a dictionary. the keys are the state names # (though they don't _have_ to be strings). Each state is associated # with a list of possible transitions. Each transition is a tuple of # , , . # # If is 'None', it is a wildcard. # # is usually a procedure of two arguments. It is often a # 'bound method' (i.e., a method associated with a particular instance), # though it doesn't have to be. # # As a convenience, if is a string, then the string is emitted # as a token of class 'punctuation'. This simplifies the implementation # of multi-character punctuations like '!=', cutting down on the number # of states # # defines the state to transition to. # machine = { 'start' : ( # characters, action, next state (WHITESPACE, DROP, 'start' ), (LETTER, SHIFT, 'identifier' ), (DIGIT, SHIFT, 'integer' ), (QUOTE, DROP, 'string' ), (PUNCTUATION, SHIFT, 'punctuation' ), (None, ERROR, 'start' ) ), 'identifier' : ( (LETTER, SHIFT, 'identifier' ), (DIGIT, SHIFT, 'identifier' ), ('_', SHIFT, 'identifier' ), (None, EMIT, 'start' ), ), 'integer' : ( (DIGIT, SHIFT, 'integer' ), ('.', SHIFT, 'float' ), ('Ee', SHIFT, 'float2' ), (None, self.emit_integer, 'start' ), ), 'string' : ( ('\\', DROP, 'string2' ), (QUOTE, EMIT, 'start' ), (None, SHIFT, 'string' ), ), 'string2' : ( (None, SHIFT, 'string' ), ), 'float' : ( (DIGIT, SHIFT, 'float' ), ('Ee', SHIFT, 'float2' ), (None, self.emit_float, 'start' ), ), 'float2' : ( (DIGIT, SHIFT, 'float2' ), (None, self.emit_float, 'start' ), ), 'punctuation' : ( (None, EMIT, 'start' ), ) } return machine def test (): def print_token (c,t): print "class: %20s token: %s" % (repr(c), repr(t)) l = lexer(print_token, 'start') import sys sys.stdout.write ('input string: ') sys.stdout.flush() s = raw_input() for ch in s: l.next_char (ch) return l