# -*- Mode: Python; tab-width: 4 -*- # state-machine lexer for python # TODO # different types of numbers: # octal, hex, long, imaginary... # string joining [parser?] # Notes: # no generated while in brackets. # blank lines are ignored. [thus they do not affect the indent/dedent stack] # also, they do not generate a newline token. # # tabs are not a fixed size: they represent from 1 to 8 characters. import string import lexer TAB_WIDTH = 8 class indent_stack: def __init__ (self): self.stack = [] self.column = 0 def push (self, num): self.stack.append (num) self.column = self.column + num def pop (self, num): # verify dedent consistency # may include 1 or more elements of the stack, # but must pop consistently dedents = 0 self.column = self.column - num while num: top = self.stack[-1] if num < top: return -1 elif num >= top: del self.stack[-1] dedents = dedents + 1 num = num - top return dedents def depth (self): return len(self.stack) def current_column (self): return self.column class python_lexer (lexer.lexer): def __init__ (self, token_consumer, initial_state): lexer.lexer.__init__ (self, token_consumer, initial_state) self.bracket = 0 self.indent_stack = indent_stack() def emit_integer (self, ch): self.token_consumer ( 'integer', string.atoi (string.join (self.buffer, '')) ) self.clear_buffer() def emit_long (self, ch): self.token_consumer ( 'long', string.atol (string.join (self.buffer, '')) ) self.clear_buffer() def emit_float (self, ch): self.token_consumer ( 'float', string.atof (string.join (self.buffer, '')) ) self.clear_buffer() def emit_punctuation (self, ch): # Note: is _not_ the punctuation character, but rather # the character taking us out of the punctuation state. punch = self.buffer[-1] lexer.lexer.action_emit (self, ch) # keep track of bracketing for indent/dedent purposes; # indentation is ignore when inside multi-line bracketed # expressions. if punch in "([{": self.bracket = self.bracket + 1 elif punch in ")]}": self.bracket = self.bracket - 1 keywords = string.split ( "and break class continue def del elif else except exec" "finally for from global if import in is lambda not or pass" "print raise return try while" ) def emit_name (self, ch): ident = string.join (self.buffer, '') if ident in self.keywords: char_class = 'keyword' else: char_class = 'name' self.token_consumer (char_class, ident) self.clear_buffer() def emit_string (self, ch): self.token_consumer ('string', string.join (self.buffer, '')) self.clear_buffer() def emit_triple_quote_string (self, ch): # toss the last three characters, they are quote # marks collected while in triple-quote mode. self.token_consumer ('string', string.join (self.buffer[:-3], '')) self.clear_buffer() def action_unrecog (self, ch): # We have seen an unrecognized escape sequence # in a string: keep the backslash and the strange # character. self.buffer = self.buffer + ['\\',ch] def action_front (self, ch): # whitespace at the beginning of a line is 'frontspace'. # use it to calculate the emission of indent/dedent tokens. if ch == '#': # ignore frontspace on comment lines self.clear_buffer() return if self.bracket: # if we're in the middle of a bracketed expression, # ignore the frontspace. self.clear_buffer() return # is the line blank? if not self.buffer and ch == '\n': # ignore it return else: # generate a newline token self.token_consumer ('punctuation', '') # calculate the indent column = 0 for space in self.buffer: if space == ' ': column = column + 1 elif space == '\t': column = (column/TAB_WIDTH + 1) * TAB_WIDTH else: raise lexer.LexicalError, "unknown whitespace character" ci = self.indent_stack.current_column() if (column > ci): self.token_consumer ('indent', None) self.indent_stack.push (column - ci) elif (column < ci): # check for consistent dedent dedents = self.indent_stack.pop (ci - column) if dedents == -1: raise lexer.LexicalError, "inconsistent dedent" else: for i in range(dedents): self.token_consumer ('dedent', None) self.clear_buffer() # ====================================================================== # actions for octal and hex escape sequences in strings current_oct_esc = [] def action_oct_esc (self, ch): self.current_oct_esc.append (ch) if len(self.current_oct_esc) > 3: raise lexer.LexicalError, 'octal escape sequence too long (must be 3 chars or less)' def emit_oct_esc (self, ch): ascii = chr ( string.atoi ( string.join (self.current_oct_esc, ''), 8 ) ) self.buffer.append (ascii) self.buffer.append (ch) self.current_oct_esc = [] current_hex_esc = [] def action_hex_esc (self, ch): self.current_hex_esc.append (ch) if len(self.current_hex_esc) > 3: raise lexer.LexicalError, 'hex escape sequence too long (must be 3 chars or less)' def emit_hex_esc (self, ch): # pay attention only to the lower 8 bits ascii = chr ( string.atoi ( string.join (self.current_hex_esc[-2:], ''), 16 ) ) self.buffer.append (ascii) self.buffer.append (ch) self.current_hex_esc = [] def action_ascii_esc (self, ch): self.buffer.append ( {'a':'\a', 'b':'\b', 'f':'\f', 'n':'\n', 'r':'\r', 't':'\t', 'v':'\v'}[ch] ) def build_machine (self): # Character classes LETTER = string.letters DIGIT = string.digits WHITESPACE = ' \t' NEWLINE = '\n\r' # characters that start a two-char punctuation TWOPUNCT = '<>!*=' # what's left ONEPUNCT = '.:,(){}[]+-/%^&' HEXDIGIT = string.hexdigits OCTDIGIT = string.octdigits # actions DROP = self.action_drop # drop a character SHIFT = self.action_shift # collect a character ERROR = self.action_error EMIT = self.action_emit FRONT = self.action_front UNRECOG = self.action_unrecog # special emitters PUNCT = self.emit_punctuation IDENT = self.emit_name FLOAT = self.emit_float INT = self.emit_integer LONG = self.emit_long STR = self.emit_string TQSTR = self.emit_triple_quote_string HEX_ESC = self.action_hex_esc OCT_ESC = self.action_oct_esc EMIT_HEX_ESC = self.emit_hex_esc EMIT_OCT_ESC = self.emit_oct_esc ASCII = self.action_ascii_esc machine = { 'start' : ( # characters, action, next state (WHITESPACE, DROP, 'whitespace' ), ('\\', DROP, 'linejoin' ), (NEWLINE, DROP, 'frontspace' ), (LETTER, SHIFT, 'name' ), ("_", SHIFT, 'name' ), (DIGIT, SHIFT, 'integer' ), ('"', DROP, 'dstring' ), ("'", DROP, 'sstring' ), ("<", SHIFT, 'lessthan' ), (">", SHIFT, 'greaterthan' ), ("!", SHIFT, 'bang' ), ("*", SHIFT, 'splat' ), ("=", SHIFT, 'equal' ), (ONEPUNCT, SHIFT, 'punctuation' ), ("#", DROP, 'comment' ), (None, ERROR, 'start' ) ), # whitespace at the front of a line 'frontspace' : ( (WHITESPACE, SHIFT, 'frontspace' ), (None, FRONT, 'start' ), ), # whitespace elsewhere 'whitespace' : ( (None, DROP, 'start' ), ), 'linejoin' : ( (NEWLINE, DROP, 'start' ), (None, ERROR, 'start' ), ), 'comment' : ( (NEWLINE, DROP, 'frontspace' ), (None, DROP, 'comment' ), ), # also recognizes keywords 'name' : ( (LETTER, SHIFT, 'name' ), (DIGIT, SHIFT, 'name' ), ('_', SHIFT, 'name' ), (None, IDENT, 'start' ), ), # numeric literals 'integer' : ( (DIGIT, SHIFT, 'integer' ), ('L', LONG, 'start' ), ('.', SHIFT, 'float' ), ('Ee', SHIFT, 'float2' ), (None, INT, 'start' ), ), 'float' : ( (DIGIT, SHIFT, 'float' ), ('Ee', SHIFT, 'float2' ), (None, FLOAT, 'start' ), ), 'float2' : ( (DIGIT, SHIFT, 'float2' ), (None, FLOAT, 'start' ), ), # Python uses a sophisticated string recognizer; # There are two types of strings: single or double-quote delimited. # Each of these can also be involved in a triple-quote drama. # ANSI C escape sequences are also supported. # # The machine for each string type is duplicated. # double-quote strings 'dstring' : ( ('\\', DROP, 'descape' ), ('"', DROP, 'dstring2' ), (None, SHIFT, 'dstring6' ), ), 'dstring2' : ( ('"', DROP, 'dstring3' ), (None, STR, 'start' ), ), 'dstring3' : ( ('"', SHIFT, 'dstring4' ), (None, SHIFT, 'dstring3' ), ), 'dstring4' : ( ('"', SHIFT, 'dstring5' ), (None, SHIFT, 'dstring3' ), ), 'dstring5' : ( ('"', TQSTR, 'start' ), (None, SHIFT, 'dstring3' ), ), 'dstring6' : ( ('\\', DROP, 'dstring7' ), ('"', STR, 'start' ), (None, SHIFT, 'dstring6' ), ), 'dstring7' : ( (None, SHIFT, 'dstring6' ), ), # identical to the 'dstring' states, but with single quotes 'sstring' : ( ('\\', DROP, 'sescape' ), ("'", DROP, 'sstring2' ), (None, SHIFT, 'sstring' ), ), 'sstring2' : ( ("'", DROP, 'sstring3' ), (None, STR, 'start' ), ), 'sstring3' : ( ("'", SHIFT, 'sstring4' ), (None, SHIFT, 'sstring3' ), ), 'sstring4' : ( ("'", SHIFT, 'sstring5' ), (None, SHIFT, 'sstring3' ), ), 'sstring5' : ( ("'", TQSTR, 'start' ), (None, SHIFT, 'sstring3' ), ), 'sstring6' : ( ("'", STR, 'start' ), ('\\', DROP, 'sstring7' ), (None, SHIFT, 'sstring6' ), ), 'sstring7' : ( (None, SHIFT, 'sstring6' ), ), # ANSI C escape sequences 'descape' : ( (NEWLINE, DROP, 'dstring' ), ('\\\'\"', SHIFT, 'dstring' ), ('abfnrtv', ASCII, 'dstring' ), ('01', OCT_ESC, 'doct_esc' ), ('x', HEX_ESC, 'dhex_esc' ), (None, UNRECOG, 'dstring' ), ), 'doct_esc' : ( (OCTDIGIT, OCT_ESC, 'doct_esc' ), (None, EMIT_OCT_ESC, 'dstring' ), ), 'dhex_esc' : ( (HEXDIGIT, HEX_ESC, 'dhex_esc' ), (None, EMIT_HEX_ESC, 'dstring' ), ), # duplicated for single-quote strings 'sescape' : ( (NEWLINE, DROP, 'sstring' ), ('\\\'\"', SHIFT, 'sstring' ), ('abfnrtv', ASCII, 'sstring' ), ('01', OCT_ESC, 'soct_esc' ), ('x', HEX_ESC, 'shex_esc' ), (None, UNRECOG, 'sstring' ), ), 'soct_esc' : ( (OCTDIGIT, OCT_ESC, 'soct_esc' ), (None, EMIT_OCT_ESC, 'sstring' ), ), 'shex_esc' : ( (HEXDIGIT, HEX_ESC, 'shex_esc' ), (None, EMIT_HEX_ESC, 'sstring' ), ), # Punctuation. Some are made of two characters. 'punctuation' : ( (None, PUNCT, 'start' ), ), 'lessthan' : ( ('>', '!=', 'start' ), ('<', '<<', 'start' ), ('=', '<=', 'start' ), (None, PUNCT, 'start' ), ), 'greaterthan' : ( ('>', '>>', 'start' ), ('=', '>=', 'start' ), (None, PUNCT, 'start' ), ), 'bang' : ( ('=', '!=', 'start' ), (None, PUNCT, 'start' ), ), 'splat' : ( ('*', '**', 'start' ), (None, PUNCT, 'start' ), ), 'equal' : ( ('=', '==', 'start' ), (None, PUNCT, 'start' ), ), } return machine sample = """ # snide class middle: def madness (did,did_not,did_so,shut_up): print "nevermore" return 1<<2+did class lower: def madness (): print "quoth" return a != b thing = None """ # indent stack: # # 0 |class snout: # 2 i | def method (stuff): # 2 2 i | if test: # 2 2 2 i | do_thing() # 2 2 d | else: # 2 2 2 i | do_other_thing() # | # 2 dd | def other_method (other_stuff): # 2 2 i | yet_more_stuff() # | # 0 dd |thing = None def print_token (c,t): print "class: %20s token: %s" % (repr(c), repr(t)) class token_collector: def __init__ (self): self.tokens = [] def collect (self, token_class, token_data): self.tokens.append (token_class, token_data) # scan a string def test (s=sample, consumer=print_token): l = python_lexer (consumer, 'start') for ch in s: l.next_char (ch) return l # scan a file def test2 (filename='python_lexer.py', consumer=print_token): l = python_lexer (consumer, 'start') for ch in open(filename, 'rb').read(): l.next_char (ch) return l # collect the tokens from a file def test3 (filename='python_lexer.py'): tc = token_collector() test2 (filename,tc.collect) return tc.tokens def generate_gml (file, lexer): file.write ('graph [\n') file.write (' directed 1\n') table = lexer.table_machine labels = lexer.state_keys nstates = len(table) # emit node info for s in range(nstates): file.write (' node [ id %d label "%s" ]\n' % (s,labels[s])) # emit edge info for s in range(nstates): for c,a,n in table[s]: file.write (' edge [ source %d target %d ]\n' % (s,n)) file.write (']\n') if __name__ == '__main__': test2()