# -*- Mode: Python; tab-width: 4 -*-
# $Id: parsdate.py,v 1.3 1996/04/11 06:58:40 rushing Exp $

# Parse various date headers you might find in mail or news, including
# some common violations of rfc822
#
# Author: Sam Rushing (rushing@nightmare.com)
#
# Much of this is a translation of timezone.el from GNU Emacs; the
# patterns are timezone.el's regexps, transliterated from Emacs syntax
# to the syntax of the standard `re` module (same groups, same order).
# Many thanks to Masanobu Umeda, Rich Salz and anyone else who's
# contributed to timezone.el or parsedate.y

# The original author tested the first version of this module with
# ~10,000 date headers from his own mail, where it matched parsedate.y
# on all dates that could be parsed by both.
# This module can handle several formats that parsedate.y can't.

import calendar
import re
import string   # no longer used here; kept so the module namespace is unchanged
import time


class error(Exception):
    """Raised when a date, time, month or timezone cannot be parsed.

    The lowercase name is historical and is what callers catch
    (``except parsdate.error``).
    """


short_days = '(mon|tue|wed|thu|fri|sat|sun)'
long_days = '(monday|tuesday|wednesday|thursday|friday|saturday|sunday)'

# strip off extra leading whitespace, and the day of the week.
# Long names are tried first so that "Friday" is not cut down to "Fri".
dow_stripper = re.compile(
    '[ \t]*((%s|%s),?[ \t]*)?' % (long_days, short_days),
    re.IGNORECASE
    )

# from timezone.el, the function timezone-parse-date
# Understands the following styles:
# (1) 14 Apr 89 03:20[:12] [GMT]
# (2) Fri, 17 Mar 89 4:01[:33] [GMT]
# (3) Mon Jan 16 16:12[:37] [GMT] 1989
# (4) 6 May 1992 1641-JST (Wednesday)
# (5) 22-AUG-1993 10:59:12.82
# the day of the week has to be stripped off first (SMR)
# RFC850 lives on, in HTTP.
# (6) Weekday, DD-Mon-YY HH:MM:SS TIMEZONE
#
# NOTE: partition_date() only accepts a pattern when it consumes the
# whole string, so trailing text such as the " (Wednesday)" of style (4)
# must be removed by the caller.

# regular expressions are your friends!
# Each entry is (pattern, (year, month, day, time, timezone)) where the
# tuple holds the group number of each field; a timezone of None means
# the pattern has no timezone group.
date_regex_list = [
    # Styles: (1) and (2) without timezone
    (re.compile(r"([0-9]+)[ \t]+([^ \t,]+)[ \t]+([0-9]+)[ \t]+"
                r"([0-9]+:[0-9:]+)[ \t]*\Z"),
     (3, 2, 1, 4, None)),
    # Styles: (1) and (2) with timezone and buggy timezone
    (re.compile(r"([0-9]+)[ \t]+([^ \t,]+)[ \t]+([0-9]+)[ \t]+"
                r"([0-9]+:[0-9:]+)[ \t]*([-+a-zA-Z0-9]+)"),
     (3, 2, 1, 4, 5)),
    # Styles: (3) without timezone
    (re.compile(r"([^ \t,]+)[ \t]+([0-9]+)[ \t]+([0-9]+:[0-9:]+)[ \t]+"
                r"([0-9]+)"),
     (4, 1, 2, 3, None)),
    # Styles: (3) with timezone
    (re.compile(r"([^ \t,]+)[ \t]+([0-9]+)[ \t]+([0-9]+:[0-9:]+)[ \t]+"
                r"([-+a-zA-Z0-9]+)[ \t]+([0-9]+)"),
     (5, 1, 2, 3, 4)),
    # Styles: (4) with timezone
    (re.compile(r"([0-9]+)[ \t]+([^ \t,]+)[ \t]+([0-9]+)[ \t]+"
                r"([0-9]+)[ \t]*([-+a-zA-Z0-9]+)"),
     (3, 2, 1, 4, 5)),
    # Styles: (5) without timezone.
    (re.compile(r"([0-9]+)-([A-Za-z]+)-([0-9]+)[ \t]+"
                r"([0-9]+:[0-9]+:[0-9]+)\.[0-9]+"),
     (3, 2, 1, 4, None)),
    # Styles: (6) with timezone.
    (re.compile(r"([0-9]+)-([A-Za-z]+)-([0-9]+)[ \t]+"
                r"([0-9]+:[0-9]+:[0-9]+)[ \t]*([-+a-zA-Z0-9]+)"),
     (3, 2, 1, 4, 5)),
    ]


def partition_date(date):
    """Split a date header into its raw, still unparsed, fields.

    Leading whitespace and an optional day of the week are dropped, then
    the patterns of `date_regex_list` are tried in order.  A pattern is
    accepted only if its match ends exactly at the end of the string.

    Returns a tuple of strings (year, month, day, time, timezone);
    timezone is None when the accepted pattern has no timezone group.
    Raises `error` when no pattern covers the whole string.
    """
    skip = dow_stripper.match(date)
    if skip is not None:
        date = date[skip.end():]
    for reg, idx in date_regex_list:
        found = reg.match(date)
        if found is None or found.end() != len(date):
            continue
        if idx[4] is None:
            tz = None
        else:
            tz = found.group(idx[4])
        return (found.group(idx[0]),
                found.group(idx[1]),
                found.group(idx[2]),
                found.group(idx[3]),
                tz)
    raise error("couldn't partition date")


# format is (regex, seconds_present)
time_regex_list = [
    # HH:MM:SS
    (re.compile(r"\A([0-9]+):([0-9]+):([0-9]+)\Z"), 1),
    # HH:MM
    (re.compile(r"\A([0-9]+):([0-9]+)\Z"), 0),
    # HHMMSS
    (re.compile(r"\A([0-9][0-9])([0-9][0-9])([0-9][0-9])\Z"), 1),
    # HHMM
    (re.compile(r"\A([0-9][0-9])([0-9][0-9])\Z"), 0),
    ]


def parse_time(time):
    """Return the integer triplet (hh, mm, ss) for a time-of-day string.

    Accepts HH:MM:SS, HH:MM, HHMMSS and HHMM; seconds default to 0.
    The values are not range checked.  Raises `error` for anything else.

    The parameter name shadows the `time` module inside this function
    only; it is kept because it is part of the public signature.
    """
    for reg, secondp in time_regex_list:
        found = reg.match(time)
        if found is None:
            continue
        hh = int(found.group(1))
        mm = int(found.group(2))
        if secondp == 1:
            return (hh, mm, int(found.group(3)))
        return (hh, mm, 0)
    raise error("couldn't partition time")


# this table is based on the one in 'parsedate.y', from the INN 1.4
# distribution.
# Each value is (dstoff, hour, min): hour:min is the zone's offset from
# GMT written the way people write it ("-3:30" is (-3, 30)), and dstoff
# is -1 for zones that are one hour further ahead because of daylight
# saving, 0 otherwise.
zone_map = {
    "gmt":  ( 0,  0,  0),   # Greenwich Mean
    "ut":   ( 0,  0,  0),   # Universal
    "utc":  ( 0,  0,  0),   # Universal Coordinated
    "cut":  ( 0,  0,  0),   # Coordinated Universal
    "z":    ( 0,  0,  0),   # Greenwich Mean
    "wet":  ( 0,  0,  0),   # Western European
    "bst":  (-1,  0,  0),   # British Summer
    "nst":  ( 0, -3, 30),   # Newfoundland Standard
    "ndt":  (-1, -3, 30),   # Newfoundland Daylight
    "ast":  ( 0, -4,  0),   # Atlantic Standard
    "adt":  (-1, -4,  0),   # Atlantic Daylight
    "est":  ( 0, -5,  0),   # Eastern Standard
    "edt":  (-1, -5,  0),   # Eastern Daylight
    "cst":  ( 0, -6,  0),   # Central Standard
    "cdt":  (-1, -6,  0),   # Central Daylight
    "mst":  ( 0, -7,  0),   # Mountain Standard
    "mdt":  (-1, -7,  0),   # Mountain Daylight
    "pst":  ( 0, -8,  0),   # Pacific Standard
    "pdt":  (-1, -8,  0),   # Pacific Daylight
    "yst":  ( 0, -9,  0),   # Yukon Standard
    "ydt":  (-1, -9,  0),   # Yukon Daylight
    "akst": ( 0, -9,  0),   # Alaska Standard
    "akdt": (-1, -9,  0),   # Alaska Daylight
    "hst":  ( 0, -10, 0),   # Hawaii Standard
    "hast": ( 0, -10, 0),   # Hawaii-Aleutian Standard
    "hadt": (-1, -10, 0),   # Hawaii-Aleutian Daylight
    "ces":  (-1,  1,  0),   # Central European Summer
    "cest": (-1,  1,  0),   # Central European Summer
    "mez":  ( 0,  1,  0),   # Middle European
    "mezt": (-1,  1,  0),   # Middle European Summer
    "cet":  ( 0,  1,  0),   # Central European
    "met":  ( 0,  1,  0),   # Middle European
    "eet":  ( 0,  2,  0),   # Eastern Europe
    "msk":  ( 0,  3,  0),   # Moscow Winter
    "msd":  (-1,  3,  0),   # Moscow Summer
    "wast": ( 0,  8,  0),   # West Australian Standard
    "wadt": (-1,  8,  0),   # West Australian Daylight
    "hkt":  ( 0,  8,  0),   # Hong Kong
    "cct":  ( 0,  8,  0),   # China Coast
    "jst":  ( 0,  9,  0),   # Japan Standard
    "kst":  ( 0,  9,  0),   # Korean Standard
    "kdt":  ( 0,  9,  0),   # Korean Daylight
    "cast": ( 0,  9, 30),   # Central Australian Standard
    "cadt": (-1,  9, 30),   # Central Australian Daylight
    "east": ( 0, 10,  0),   # Eastern Australian Standard
    "eadt": (-1, 10,  0),   # Eastern Australian Daylight
    "nzst": ( 0, 12,  0),   # New Zealand Standard
    "nzdt": (-1, 12,  0),   # New Zealand Daylight
    }

# recognizes numeric timezone offsets: an optional sign (group 1)
# followed by exactly three or four digits, [h]hmm (group 2).
numeric_timezone_reg = re.compile(r'([+-]?)([0-9]?[0-9][0-9][0-9])\Z')

# tzn[+|-]hh[:mm[:ss]][dzn]
# tzn := 3-letter tzname
# dzn := 3-letter daylight zone name
# group 1 holds the hh[:mm[:ss]] string.
symbolic_timezone_reg = re.compile(
    r"[a-zA-Z][a-zA-Z][a-zA-Z]"
    r"(([+-]?[0-9]+)(:[0-9]+(:[0-9]+)?)?)"
    r"([a-zA-Z][a-zA-Z][a-zA-Z])?"
    )


def parse_timezone(tz):
    """Return the offset from GMT, in seconds, of timezone string `tz`.

    Positive results are east of Greenwich (ahead of GMT).  Three forms
    are understood, tried in this order:

      * numeric offsets such as -0500 or +0930,
      * the names listed in `zone_map` (case-insensitive),
      * the TZ environment variable form tzn[+|-]hh[:mm[:ss]][dzn].

    Raises `error` when `tz` fits none of them.
    """
    # this is the preferred format: -0500, +0930
    found = numeric_timezone_reg.match(tz)
    if found is not None:
        digits = found.group(2)
        seconds = 3600 * int(digits[:-2]) + 60 * int(digits[-2:])
        # The sign is read from the string rather than from the hours,
        # so that it also applies to the minutes ("-0530") and is not
        # lost when the hours are zero ("-0030").
        if found.group(1) == '-':
            return -seconds
        return seconds

    tz = tz.lower()

    # semi-standard timezone name: EST, PST, MET
    # use table lookup
    if tz in zone_map:
        dstoff, hour, minute = zone_map[tz]
        # the table writes -3:30 as (-3, 30): the minutes carry the
        # sign of the hours.
        seconds = 3600 * abs(hour) + 60 * minute
        if hour < 0:
            seconds = -seconds
        return seconds - 3600 * dstoff

    # TZ environment variable format,
    # not legal in rfc822 or rfc1036, but it shows
    # up occasionally.
    # note that the numeric offset is negative
    found = symbolic_timezone_reg.match(tz)
    if found is not None:
        spec = found.group(1)
        negative = spec[0] == '-'
        if spec[0] in '+-':
            spec = spec[1:]
        factor = 3600
        offset = 0
        for field in spec.split(':'):
            offset = offset + factor * int(field)
            factor = factor // 60
        if negative:
            offset = -offset
        return -offset

    raise error("couldn't parse timezone")


month_table = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec']


def _expand_year(yy):
    """Turn the year digits of a header into a full year number.

    Follows the RFC 2822 rule for obsolete dates: two-digit years 00-49
    mean 20xx, two-digit years 50-99 and all three-digit years are
    offsets from 1900.  Anything else is taken literally.
    """
    year = int(yy)
    if len(yy) == 2:
        if year < 50:
            return 2000 + year
        return 1900 + year
    if len(yy) == 3:
        return 1900 + year
    return year


def parsedate(date):
    """Parse a mail/news date header.

    Returns a pair (timestamp, (yy, mm, dd, tt, tz)):

      * timestamp is a float, in seconds since the epoch,
      * yy, mm, dd are integers (mm is 1-12),
      * tt is the (hh, mm, ss) triplet from parse_time(),
      * tz is the timezone exactly as written, or None if absent.

    When the header has no timezone, or one that cannot be parsed (a
    message is printed in that case), the fields are interpreted as
    local time.

    Raises `error` if the date cannot be partitioned, the month is not a
    three-letter English abbreviation, or the time is malformed.
    """
    yy, mm, dd, tt, tz = partition_date(date)
    yy = _expand_year(yy)
    mm = mm.lower()
    if mm not in month_table:
        raise error("bogus month name")
    mm = month_table.index(mm) + 1
    dd = int(dd)
    tt = parse_time(tt)

    tzoff = None
    if tz:
        try:
            tzoff = parse_timezone(tz)
        except error:
            # best effort: fall back to local time below
            print("couldn't parse timezone")

    if tzoff is not None:
        # Read the fields as if they were GMT, then take off the zone's
        # offset.  This does not involve the local timezone at all.
        stamp = calendar.timegm((yy, mm, dd, tt[0], tt[1], tt[2], 0, 0, 0))
        return float(stamp - tzoff), (yy, mm, dd, tt, tz)

    # the final -1 arg means 'let the system decide if DST applies for
    # this date'
    stamp = time.mktime((yy, mm, dd, tt[0], tt[1], tt[2], 0, 0, -1))
    return stamp, (yy, mm, dd, tt, tz)


# Comparison harness used by the original author (historical, written
# for the old interpreter; kept for reference only):
#
# def test():
#     import parsdate
#     fd = open('f:/tmp/dates.txt', 'r')
#     while 1:
#         line = fd.readline()
#         if line == '':
#             break
#         line = line[6:-1]
#         try:
#             date1 = parsedate (line)
#         except error, why:
#             print why, line
#             break
#         date2 = parsdate.parsedate (line)
#         if date2 == -1:
#             print 'parsdate parse failed on '+line
#         else:
#             if date1 != date2:
#                 print 'difference: '+line
#         print '.'
#
# dates seen but not yet handled (violators of rfc822 in a bad way)
# (from amdahl.com) 'Friday, 7 January 1994 07:03 PT'
# (from psi.com) dates with a TZ of 'U'... the code punts to GMT.