# -*- Mode: Python -*-

import charset

# this is so they print out pretty
class operator:
    def __init__ (self, s):
        self.s = s
    def __repr__ (self):
        return self.s

# operators
CONCAT   = operator ('.')
STAR     = operator ('*')
OR       = operator ('|')
PLUS     = operator ('+')
OPTIONAL = operator ('?')

# token, used by lexer
TOKEN    = operator ('TOKEN')

# (ab|ba)*
# (star (or (concat "a" "b") (concat "b" "a")))

r1 = '(ab|ba)*'
r2 = '(\r\n.\r\n)|(\n.\n)|(\r.\r)'
r3 = 'a+(.a+)+$'

def pprint_regexp (exp):
    if type(exp) == type(''):
        return exp
    else:
        op = exp[0]
        args = map (pprint_regexp, exp[1:])
        if op is CONCAT:
            return ''.join (args)
        elif op is STAR:
            return '(' + args[0] + ')*'
        elif op is OR:
            return '(' + '|'.join (args) + ')'
        else:
            raise ValueError, 'Unknown operator "%s"' % op

def nary_op (l, op):
    "apply binary OP to the <n> expressions of <l>"
    if len(l) == 1:
        return l[0]
    elif len(l) == 2:
        return (op, l[0], l[1])
    else:
        return (op, l[0], nary_op (l[1:], op))

def n_concat (l):
    "CONCAT the <n> expressions of <l> using only the binary operator"
    return nary_op (l, CONCAT)

def n_or (l):
    return nary_op (l, OR)

def parse (s, pos=0):
    """parse a regular expression into tree format."""
    exp, pos = _parse(s,pos)
    return exp

# abc+def
# parse a series of atoms, each to be CONCAT'd together

def _parse (s, pos=0):
    exp = []
    while 1:
        if pos >= len(s):
            return n_concat (exp), pos
        else:
            ch = s[pos]
            if ch == '(':
                sub, pos = _parse (s, pos+1)
                exp.append (sub)
            elif ch == ')':
                return n_concat (exp), pos
            elif ch == '*':
                # apply STAR to the last expression
                exp[-1] = (STAR, exp[-1])
            elif ch == '+':
                # apply PLUS to the last expression
                exp[-1] = (PLUS, exp[-1])
            elif ch == '|':
                # apply OR to the left side vs. whatever follows
                # note: this is more 'greedy' than +,?, or *
                sub, pos = _parse (s, pos+1)
                return (OR, n_concat (exp), sub), pos
            elif ch == '?':
                # apply OPTIONAL to the last expression
                exp[-1] = (OPTIONAL, exp[-1])
            elif ch == '[':
                # charset
                set, pos = charset.parse_charset (s, pos + 1)
                exp.append (set)
            elif ch == '\\':
                # literal character
                pos += 1
                ch = s[pos]
                ch = charset.special_chars.get (ch, ch)
                exp.append (charset.make_single_charset (ch))
            elif ch == '.':
                # dot
                exp.append (charset.DOT)
            else:
                exp.append (charset.make_single_charset (ch))
            pos += 1

if __name__ == '__main__':
    #print parse ('([A-Za-z][A-Za-z0-9_]*')
    #print parse (r'"([^\\\n"]|(\\.))*"')
    augassign = '|'.join (r'\+= -= \*= /= %= &= \|= \^= <<= >>= \*\*= //='.split())
    augassign = ('(' + '|'.join (r'\+ - \* / % & \| \^ << >> \*\* //'.split()) + ')=')
    print parse (augassign)