# -*- Mode: Python -*- import operator special_chars = {'r':'\r', 'n':'\n', 't':'\t'} is_a = isinstance def compact_chr (n): "helper utility for printing characters" ch = chr(n) if len(repr(ch)) == 3: return ch else: return repr(ch)[1:-1] class charset: def __init__ (self, set, bits=8): self.bits = bits self.max_char = 2 ** self.bits assert (len(set) == self.max_char) self.set = tuple (set) self.num_set = reduce (operator.add, set) self.repr_cache = None def as_string (self): return ''.join ([str(x) for x in self.set]) def as_ranges (self): "describe a charset compactly as a set of ranges" count = reduce (operator.add, self.set) ranges = [] if count == self.max_char: # it's the whole set, aka DOT return 0, [(0, 255)] elif count > (self.max_char / 2): # more set than not set, print as an inverse invert = 1 else: invert = 0 on = invert ^ self.set[0] if on: start = 0 for i in range (1,self.max_char + 1): if i == self.max_char: # pretend there's an 'off' bit at the end this_bit_on = 0 else: this_bit_on = invert ^ self.set[i] if on: # still on? if this_bit_on: # yup pass else: ranges.append ((start, i-1)) on = 0 else: # still off? if this_bit_on: # nope start = i on = 1 else: # yup pass return invert, ranges def compute_repr (self): "render a character set, in the expected manner" invert, ranges = self.as_ranges() if ranges == [(0,255)]: return '.' if invert: r = ["[^"] else: r = ["["] for start, end in ranges: if start == end: # one character r.append (compact_chr (start)) elif start + 1 == end: # two characters r.append (compact_chr (start)) r.append (compact_chr (end)) else: # an actual 'range' r.append ('%s-%s' % (compact_chr (start), compact_chr (end))) #count = reduce (operator.add, self.set) #r.append ('].%d' % (count,)) r.append (']') return ''.join (r) def __repr__ (self): if self.repr_cache is None: self.repr_cache = self.compute_repr() #if self.bits != 8: # return '%d' % (self.bits,) + self.repr_cache #else: return self.repr_cache def __hash__ (self): return hash (self.set) def __cmp__ (self, other): if isinstance (other, charset): return cmp (self.set, other.set) else: return 1 def __getitem__ (self, index): return self.set[index] def has (self, ch): return self.set[ord(ch)] def __add__ (self, other): assert isinstance (other, charset) new_set = [ (self.set[i] or other.set[i]) for i in range (self.max_char)] return make_charset (new_set, self.bits) def overlap (self, other): "is the union of and non-empty?" #W ('overlap (%r, %r)\n' % (self, other)) if other is self: return 1 elif isinstance (other, charset): for i in range (self.max_char): if other.set[i] and self.set[i]: return 1 return 0 else: return 0 def insensitive (self): "return a case-insensitive version of this charset" import string set = list(self.set[:]) # trying not to make too many ascii assumptions. # as a pre-processing step, an insensitive regexp is # downcased - so we need only copy the status of the lowercase # letters to the uppercase letters. for i in range (len (string.lowercase)): ch_lo = ord(string.lowercase[i]) ch_hi = ord(string.uppercase[i]) set[ch_hi] = set[ch_lo] return make_charset (set, self.bits) cache = {} def make_charset (set, bits=8): "make a charset, using the cache if possible" set = tuple (set) if not cache.has_key (set): cache[set] = charset (set, bits) return cache[set] def make_single_charset (ch, bits=8): "make a charset consisting of a single character" assert (type(ch) is type('') and len(ch) == 1) set = [0] * (2 ** bits) set[ord(ch)] = 1 return make_charset (set, bits) def parse_charset (s, pos=1): "parse a charset definition (e.g., '[A-Za-z0-9]' or '[^0-9])" set = [0] * 256 invert = 0 if s[pos:pos+1] == '^': # negate invert = 1 pos += 1 elif s[pos:pos+1] == '-': # set the minus character set[ord(s[pos:pos+1])] = 1 pos += 1 while s[pos] != ']': if s[pos] == '-': start = ord(s[pos-1]) end = ord(s[pos+1]) assert (start < end) for i in range(start,end+1): set[i] = 1 pos += 2 elif s[pos] == '\\': # escape ch = s[pos+1] ch = special_chars.get (ch, ch) set[ord(ch)] = 1 pos += 2 else: set[ord(s[pos])] = 1 pos += 1 if invert: return make_charset ([ not x for x in set ]), pos else: return make_charset (set), pos DOT = make_charset ([1]*256)