# -*- Mode: Python -*-
import operator
special_chars = {'r':'\r', 'n':'\n', 't':'\t'}
is_a = isinstance
def compact_chr (n):
"helper utility for printing characters"
ch = chr(n)
if len(repr(ch)) == 3:
return ch
else:
return repr(ch)[1:-1]
class charset:
def __init__ (self, set, bits=8):
self.bits = bits
self.max_char = 2 ** self.bits
assert (len(set) == self.max_char)
self.set = tuple (set)
self.num_set = reduce (operator.add, set)
self.repr_cache = None
def as_string (self):
return ''.join ([str(x) for x in self.set])
def as_ranges (self):
"describe a charset compactly as a set of ranges"
count = reduce (operator.add, self.set)
ranges = []
if count == self.max_char:
# it's the whole set, aka DOT
return 0, [(0, 255)]
elif count > (self.max_char / 2):
# more set than not set, print as an inverse
invert = 1
else:
invert = 0
on = invert ^ self.set[0]
if on:
start = 0
for i in range (1,self.max_char + 1):
if i == self.max_char:
# pretend there's an 'off' bit at the end
this_bit_on = 0
else:
this_bit_on = invert ^ self.set[i]
if on:
# still on?
if this_bit_on:
# yup
pass
else:
ranges.append ((start, i-1))
on = 0
else:
# still off?
if this_bit_on:
# nope
start = i
on = 1
else:
# yup
pass
return invert, ranges
def compute_repr (self):
"render a character set, in the expected manner"
invert, ranges = self.as_ranges()
if ranges == [(0,255)]:
return '.'
if invert:
r = ["[^"]
else:
r = ["["]
for start, end in ranges:
if start == end:
# one character
r.append (compact_chr (start))
elif start + 1 == end:
# two characters
r.append (compact_chr (start))
r.append (compact_chr (end))
else:
# an actual 'range'
r.append ('%s-%s' % (compact_chr (start), compact_chr (end)))
#count = reduce (operator.add, self.set)
#r.append ('].%d' % (count,))
r.append (']')
return ''.join (r)
def __repr__ (self):
if self.repr_cache is None:
self.repr_cache = self.compute_repr()
#if self.bits != 8:
# return '%d' % (self.bits,) + self.repr_cache
#else:
return self.repr_cache
def __hash__ (self):
return hash (self.set)
def __cmp__ (self, other):
if isinstance (other, charset):
return cmp (self.set, other.set)
else:
return 1
def __getitem__ (self, index):
return self.set[index]
def has (self, ch):
return self.set[ord(ch)]
def __add__ (self, other):
assert isinstance (other, charset)
new_set = [ (self.set[i] or other.set[i]) for i in range (self.max_char)]
return make_charset (new_set, self.bits)
def overlap (self, other):
"is the union of <self> and <other> non-empty?"
#W ('overlap (%r, %r)\n' % (self, other))
if other is self:
return 1
elif isinstance (other, charset):
for i in range (self.max_char):
if other.set[i] and self.set[i]:
return 1
return 0
else:
return 0
def insensitive (self):
"return a case-insensitive version of this charset"
import string
set = list(self.set[:])
# trying not to make too many ascii assumptions.
# as a pre-processing step, an insensitive regexp is
# downcased - so we need only copy the status of the lowercase
# letters to the uppercase letters.
for i in range (len (string.lowercase)):
ch_lo = ord(string.lowercase[i])
ch_hi = ord(string.uppercase[i])
set[ch_hi] = set[ch_lo]
return make_charset (set, self.bits)
cache = {}
def make_charset (set, bits=8):
"make a charset, using the cache if possible"
set = tuple (set)
if not cache.has_key (set):
cache[set] = charset (set, bits)
return cache[set]
def make_single_charset (ch, bits=8):
"make a charset consisting of a single character"
assert (type(ch) is type('') and len(ch) == 1)
set = [0] * (2 ** bits)
set[ord(ch)] = 1
return make_charset (set, bits)
def parse_charset (s, pos=1):
"parse a charset definition (e.g., '[A-Za-z0-9]' or '[^0-9])"
set = [0] * 256
invert = 0
if s[pos:pos+1] == '^':
# negate
invert = 1
pos += 1
elif s[pos:pos+1] == '-':
# set the minus character
set[ord(s[pos:pos+1])] = 1
pos += 1
while s[pos] != ']':
if s[pos] == '-':
start = ord(s[pos-1])
end = ord(s[pos+1])
assert (start < end)
for i in range(start,end+1):
set[i] = 1
pos += 2
elif s[pos] == '\\':
# escape
ch = s[pos+1]
ch = special_chars.get (ch, ch)
set[ord(ch)] = 1
pos += 2
else:
set[ord(s[pos])] = 1
pos += 1
if invert:
return make_charset ([ not x for x in set ]), pos
else:
return make_charset (set), pos
DOT = make_charset ([1]*256)