1 """RFC 2822 message manipulation.
3 Note: This is only a very rough sketch of a full RFC-822 parser; in particular
4 the tokenizing of addresses does not adhere to all the quoting rules.
6 Note: RFC 2822 is a long awaited update to RFC 822. This module should
7 conform to RFC 2822, and is thus mis-named (it's not worth renaming it). Some
8 efforts at RFC 2822 updates have been made, but a thorough audit has not been
9 performed. Consider any RFC 2822 non-conformance to be a bug.
11 RFC 2822: http://www.faqs.org/rfcs/rfc2822.html
12 RFC 822 : http://www.faqs.org/rfcs/rfc822.html (obsolete)
16 To create a Message object: first open a file, e.g.:
20 You can use any other legal way of getting an open file object, e.g. use
21 sys.stdin or call os.popen(). Then
pass the open file object to the
Message()
26 This
class can work with
any input object that supports
a readline method. If
27 the input object has seek
and tell capability, the rewindbody method will
28 work; also illegal lines will be pushed back onto the input stream. If the
29 input object lacks seek but has an `unread
' method that can push back a line
30 of input, Message will use that to push back illegal lines. Thus this class
31 can be used to parse messages coming
from a buffered stream.
33 The optional `seekable
' argument is provided as a workaround for certain stdio
34 libraries in which
tell() discards buffered data before discovering that the
35 lseek() system call doesn
't work. For maximum portability, you should set the
36 seekable argument to zero to prevent that initial tell() call when passing in
37 an unseekable object such
as a file object created
from a socket object. If
38 it
is 1 on entry -- which it
is by default -- the
tell() method of the open
39 file object
is called once;
if this raises an exception, seekable
is reset to
40 0. For other nonzero values of seekable, this test
is not made.
42 To get the text of a particular header there are several methods:
44 str = m.getheader(name)
45 str = m.getrawheader(name)
47 where name
is the name of the header, e.g.
'Subject'. The difference
is that
48 getheader() strips the leading
and trailing whitespace,
while getrawheader()
49 doesn
't. Both functions retain embedded whitespace (including newlines)
50 exactly as they are specified
in the header,
and leave the case of the text
53 For addresses
and address lists there are functions
55 realname, mailaddress = m.getaddr(name)
56 list = m.getaddrlist(name)
58 where the latter returns a list of (realname, mailaddr) tuples.
60 There
is also a method
62 time = m.getdate(name)
64 which parses a Date-like field
and returns a time-compatible tuple,
65 i.e. a tuple such
as returned by time.localtime()
or accepted by
68 See the
class definition for
lower level access methods.
70 There are also some utility functions here.
72 # Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
76 __all__ = ["Message","AddressList","parsedate","parsedate_tz","mktime_tz"]
78 _blanklines = ('\r\n', '\n') # Optimization for islast()
82 """Represents a single RFC 2822-compliant message.
"""
84 def __init__(self, fp, seekable = 1):
85 """Initialize the
class instance and read the
headers.
"""
87 # Exercise tell() to make sure it works
88 # (and then assume seek() works, too)
91 except (AttributeError, IOError):
96 self.seekable = seekable
97 self.startofheaders = None
98 self.startofbody = None
102 self.startofheaders = self.fp.tell()
110 self.startofbody = self.fp.tell()
114 def rewindbody(self):
115 """Rewind the file to the start of the body (
if seekable).
"""
116 if not self.seekable:
117 raise IOError, "unseekable file"
118 self.fp.seek(self.startofbody)
120 def readheaders(self):
121 """Read header lines.
123 Read header lines up to the entirely blank line that terminates them.
124 The (normally blank) line that ends the headers
is skipped, but
not
125 included
in the returned list. If a non-header line ends the headers,
126 (which
is an error), an attempt
is made to backspace over it; it
is
127 never included
in the returned list.
129 The variable self.
status is set to the empty string
if all went well,
130 otherwise it
is an error message. The variable self.
headers is a
131 completely uninterpreted list of lines contained
in the header (so
132 printing them will reproduce the header exactly
as it appears
in the
137 self.headers = list = []
141 startofline = unread = tell = None
142 if hasattr(self.fp, 'unread'):
143 unread = self.fp.unread
151 startofline = tell = None
153 line = self.fp.readline()
155 self.status = 'EOF in headers'
157 # Skip unix From name time lines
158 if firstline and line.startswith('From '):
159 self.unixfrom = self.unixfrom + line
162 if headerseen and line[0] in ' \t':
163 # It's a continuation line.
165 x = (self.dict[headerseen] + "\n " + line.strip())
166 self.dict[headerseen] = x.strip()
168 elif self.iscomment(line):
169 # It's a comment. Ignore it.
171 elif self.islast(line):
172 # Note! No pushback here! The delimiter line gets eaten.
174 headerseen = self.isheader(line)
176 # It's a legal header line, save it.
178 self.dict[headerseen] = line[len(headerseen)+1:].strip()
181 # It's not a header line; throw it back and stop here.
183 self.status = 'No headers'
185 self.status = 'Non-header line where header expected'
186 # Try to undo the read.
190 self.fp.seek(startofline)
192 self.status = self.status + '; bad seek'
195 def isheader(self, line):
196 """Determine whether a given line
is a legal header.
198 This method should
return the header name, suitably canonicalized.
199 You may override this method
in order to use Message parsing on tagged
200 data
in RFC 2822-like formats with special header formats.
204 return line[:i].lower()
def islast(self, line):
    """Tell whether `line` legally terminates the RFC 2822 header section.

    Subclasses may override this to bend the rules, for instance to
    ignore trailing whitespace or to accept MH template separators
    ('--------').  As a convenience for code that reads from sockets, a
    line consisting of \\r\\n is accepted as well as a bare newline.
    """
    # _blanklines is the module-level tuple of accepted blank-line spellings.
    is_terminator = line in _blanklines
    return is_terminator
218 def iscomment(self, line):
219 """Determine whether a line should be skipped entirely.
221 You may override this method
in order to use Message parsing on tagged
222 data
in RFC 2822-like formats that support embedded comments
or
227 def getallmatchingheaders(self, name):
228 """Find all header lines matching a given header name.
230 Look through the list of headers
and find all lines matching a given
231 header name (
and their continuation lines). A list of the lines
is
232 returned, without interpretation. If the header does
not occur, an
233 empty list
is returned. If the header occurs multiple times, all
234 occurrences are returned. Case
is not important
in the header name.
236 name = name.lower() + ':'
240 for line in self.headers:
241 if line[:n].lower() == name:
243 elif not line[:1].isspace():
249 def getfirstmatchingheader(self, name):
250 """Get the first header line matching name.
252 This
is similar to getallmatchingheaders, but it returns only the
253 first matching header (
and its continuation lines).
255 name = name.lower() + ':'
259 for line in self.headers:
261 if not line[:1].isspace():
263 elif line[:n].lower() == name:
269 def getrawheader(self, name):
272 Return a string containing the literal text of the header but with the
273 keyword stripped. All leading, trailing
and embedded whitespace
is
274 kept
in the string, however. Return
None if the header does
not
278 list = self.getfirstmatchingheader(name)
281 list[0] = list[0][len(name) + 1:]
284 def getheader(self, name, default=None):
285 """Get the header value
for a name.
287 This
is the normal interface: it returns a stripped version of the
288 header value
for a given header name,
or None if it doesn
't exist.
289 This uses the dictionary version which finds the *last* such header.
292 return self.dict[name.lower()]
297 def getheaders(self, name):
298 """Get all values
for a header.
300 This returns a list of values
for headers given more than once; each
301 value
in the result list
is stripped
in the same way
as the result of
302 getheader(). If the header
is not given,
return an empty list.
307 for s in self.getallmatchingheaders(name):
310 current = "%s\n %s" % (current, s.strip())
315 result.append(current)
316 current = s[s.find(":") + 1:].strip()
319 result.append(current)
322 def getaddr(self, name):
323 """Get a single address
from a header,
as a tuple.
325 An example
return value:
326 (
'Guido van Rossum',
'guido@cwi.nl')
329 alist = self.getaddrlist(name)
335 def getaddrlist(self, name):
336 """Get a list of addresses
from a header.
338 Retrieves a list of addresses
from a header, where each address
is a
339 tuple
as returned by
getaddr(). Scans all named headers, so it works
340 properly with multiple To:
or Cc: headers
for example.
343 for h in self.getallmatchingheaders(name):
353 alladdrs = ''.join(raw)
354 a = AddrlistClass(alladdrs)
355 return a.getaddrlist()
357 def getdate(self, name):
358 """Retrieve a date field
from a header.
360 Retrieves a date field
from the named header, returning a tuple
361 compatible with time.mktime().
367 return parsedate(data)
369 def getdate_tz(self, name):
370 """Retrieve a date field
from a header
as a 10-tuple.
372 The first 9 elements make up a tuple compatible with time.mktime(),
373 and the 10th
is the offset of the poster
's time zone from GMT/UTC.
379 return parsedate_tz(data)
382 # Access as a dictionary (only finds *last* header of each type):
385 """Get the number of headers
in a message.
"""
386 return len(self.dict)
388 def __getitem__(self, name):
389 """Get a specific header,
as from a dictionary.
"""
390 return self.dict[name.lower()]
392 def __setitem__(self, name, value):
393 """Set the value of a header.
395 Note: This
is not a perfect inversion of __getitem__, because any
396 changed headers get stuck at the end of the raw-headers list rather
397 than where the altered header was.
399 del self[name] # Won't fail if it doesn't exist
400 self.dict[name.lower()] = value
401 text = name + ": " + value
402 lines = text.split("\n")
404 self.headers.append(line + "\n")
406 def __delitem__(self, name):
407 """Delete all occurrences of a specific header,
if it
is present.
"""
409 if not self.dict.has_key(name):
416 for i in range(len(self.headers)):
417 line = self.headers[i]
418 if line[:n].lower() == name:
420 elif not line[:1].isspace():
428 def get(self, name, default=""):
430 if self.dict.has_key(name):
431 return self.dict[name]
435 def setdefault(self, name, default=""):
436 lowername = name.lower()
437 if self.dict.has_key(lowername):
438 return self.dict[lowername]
440 text = name + ": " + default
441 lines = text.split("\n")
443 self.headers.append(line + "\n")
444 self.dict[lowername] = default
447 def has_key(self, name):
448 """Determine whether a message contains the named header.
"""
449 return self.dict.has_key(name.lower())
452 """Get all of a message
's header field names."""
453 return self.dict.keys()
456 """Get all of a message's header field values."""
457 return self.dict.values()
460 """Get all of a message's headers.
462 Returns a list of name, value tuples.
464 return self.dict.items()
481 """Remove quotes from a string."""
483 if str[0] ==
'"' and str[-1:] ==
'"':
485 if str[0] ==
'<' and str[-1:] ==
'>':
491 """Add quotes around a string."""
492 return str.replace(
'\\',
'\\\\').
replace(
'"',
'\\"')
496 """Parse an address into a (realname, mailaddr) tuple."""
498 list = a.getaddrlist()
506 """Address parser class by Ben Escoto.
508 To understand what this class does, it helps to have a copy of
509 RFC 2822 in front of you.
511 http://www.faqs.org/rfcs/rfc2822.html
513 Note: this class interface is deprecated and may be removed in the future.
514 Use rfc822.AddressList instead.
518 """Initialize a new instance.
520 `field' is an unparsed address header field, containing one or more
536 """Parse up to the start of the next address."""
545 """Parse all addresses.
547 Returns a list containing all of the addresses.
559 """Parse the next address."""
587 fieldlen = len(self.
field)
591 if self.
pos < fieldlen
and self.
field[self.
pos] ==
';':
601 returnlist = [(
' '.
join(plist) +
' (' + \
603 else: returnlist = [(
' '.
join(plist), routeaddr)]
617 """Parse a route address (Return-path value).
619 This method just skips all the route stuff and returns the addrspec.
649 """Parse an RFC 2822 addr-spec."""
658 aslist.append(
'"%s"' % self.
getquote())
661 else: aslist.append(self.
getatom())
665 return ''.
join(aslist)
673 """Get the complete domain name from an address."""
687 else: sdlist.append(self.
getatom())
688 return ''.
join(sdlist)
691 """Parse a header fragment delimited by special characters.
693 `beginchar' is the start character for the fragment. If self is not
694 looking at an instance of `beginchar' then getdelimited returns the
697 `endchars' is a sequence of allowable end-delimiting characters.
698 Parsing stops when one of these is encountered.
700 If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
701 within the parsed fragment.
703 if self.
field[self.
pos] != beginchar:
713 elif self.
field[self.
pos]
in endchars:
716 elif allowcomments
and self.
field[self.
pos] ==
'(':
724 return ''.
join(slist)
727 """Get a quote-delimited fragment from self's field."""
731 """Get a parenthesis-delimited fragment from self's field."""
735 """Parse an RFC 2822 domain-literal."""
739 """Parse an RFC 2822 atom.
741 Optional atomends specifies a different set of end token delimiters
742 (the default is to use self.atomends). This is used e.g. in
743 getphraselist() since phrase endings must not include the `.' (which
744 is legal in phrases)."""
750 if self.
field[self.
pos]
in atomends:
752 else: atomlist.append(self.
field[self.
pos])
755 return ''.
join(atomlist)
758 """Parse a sequence of RFC 2822 phrases.
760 A phrase is a sequence of words, which are in turn either RFC 2822
761 atoms or quoted-strings. Phrases are canonicalized by squeezing all
762 runs of continuous whitespace into one space.
781 """An AddressList encapsulates a list of parsed RFC 2822 addresses."""
783 AddrlistClass.__init__(self, field)
799 for x
in other.addresslist:
801 newaddr.addresslist.append(x)
806 for x
in other.addresslist:
808 self.addresslist.append(x)
815 if not x
in other.addresslist:
816 newaddr.addresslist.append(x)
821 for x
in other.addresslist:
823 self.addresslist.remove(x)
831 """Dump a (name, address) pair in a canonicalized form."""
833 return '"' + pair[0] +
'" <' + pair[1] +
'>'
839 _monthnames = [
'jan',
'feb',
'mar',
'apr',
'may',
'jun',
'jul',
840 'aug',
'sep',
'oct',
'nov',
'dec',
841 'january',
'february',
'march',
'april',
'may',
'june',
'july',
842 'august',
'september',
'october',
'november',
'december']
843 _daynames = [
'mon',
'tue',
'wed',
'thu',
'fri',
'sat',
'sun']
851 _timezones = {
'UT':0,
'UTC':0,
'GMT':0,
'Z':0,
852 'AST': -400,
'ADT': -300,
853 'EST': -500,
'EDT': -400,
854 'CST': -600,
'CDT': -500,
855 'MST': -700,
'MDT': -600,
856 'PST': -800,
'PDT': -700
861 """Convert a date string to a time tuple.
863 Accounts for military timezones.
868 if data[0][-1]
in (
',',
'.')
or data[0].
lower()
in _daynames:
872 stuff = data[0].
split(
'-')
874 data = stuff + data[1:]
879 data[3:] = [s[:i], s[i+1:]]
885 [dd, mm, yy, tm, tz] = data
887 if not mm
in _monthnames:
888 dd, mm = mm, dd.lower()
889 if not mm
in _monthnames:
891 mm = _monthnames.index(mm)+1
892 if mm > 12: mm = mm - 12
922 if _timezones.has_key(tz):
923 tzoffset = _timezones[tz]
936 tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
937 tuple = (yy, mm, dd, thh, tmm, tss, 0, 0, 0, tzoffset)
942 """Convert a time string to a time tuple."""
944 if type(t) == type( () ):
950 """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp."""
953 return time.mktime(data[:8] + (-1,))
955 t = time.mktime(data[:8] + (0,))
956 return t - data[9] - time.timezone
959 """Returns time format preferred for Internet standards.
961 Sun, 06 Nov 1994 08:49:37 GMT ; RFC 822, updated by RFC 1123
963 According to RFC 1123, day and month names must always be in
964 English. If not for that, this code could use strftime(). It
965 can't because strftime() honors the locale and could generate
969 timeval = time.time()
970 timeval = time.gmtime(timeval)
971 return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
972 [
"Mon",
"Tue",
"Wed",
"Thu",
"Fri",
"Sat",
"Sun"][timeval[6]],
974 [
"Jan",
"Feb",
"Mar",
"Apr",
"May",
"Jun",
975 "Jul",
"Aug",
"Sep",
"Oct",
"Nov",
"Dec"][timeval[1]-1],
976 timeval[0], timeval[3], timeval[4], timeval[5])
983 if __name__ ==
'__main__':
985 file = os.path.join(os.environ[
'HOME'],
'Mail/inbox/1')
986 if sys.argv[1:]: file = sys.argv[1]
989 print 'From:', m.getaddr(
'from')
990 print 'To:', m.getaddrlist(
'to')
991 print 'Subject:', m.getheader(
'subject')
992 print 'Date:', m.getheader(
'date')
993 date = m.getdate_tz(
'date')
997 print 'ParsedDate:', time.asctime(date),
999 hhmm, ss = divmod(hhmmss, 60)
1000 hh, mm = divmod(hhmm, 60)
1001 print "%+03d%02d" % (hh, mm),
1002 if ss:
print ".%02d" % ss,
1005 print 'ParsedDate:',
None
1012 print 'len =', len(m)
1013 if m.has_key(
'Date'):
print 'Date =', m[
'Date']
1014 if m.has_key(
'X-Nonsense'):
pass
1015 print 'keys =', m.keys()
1016 print 'values =', m.values()
1017 print 'items =', m.items()