With help from Martin v. Loewis, clarification is added for the
semantics of header chunks using byte and Unicode strings.
Specifically:

append(): When the given string is a byte string, charset (whether
specified explicitly in the argument list or implicitly via the
constructor default) is the encoding of the byte string, and a
UnicodeError will be raised if the string cannot be decoded with that
charset.  If s is a Unicode string, then charset is a hint specifying
the character set of the characters in the string.  In this case, when
producing an RFC 2822 compliant header using RFC 2047 rules, the
Unicode string will be encoded using the following charsets in order:
us-ascii, the charset hint, utf-8.

__init__(): Use the global USASCII Charset instance when the charset
argument is None.  Also, clarification in the docstring.

Also, use True/False where appropriate.
This commit is contained in:
Barry Warsaw 2002-09-30 15:51:31 +00:00
parent 884554dfe5
commit 174aa49a88

View file

@ -1,9 +1,11 @@
# Copyright (C) 2002 Python Software Foundation
# Author: che@debian.org (Ben Gertzfield)
# Author: che@debian.org (Ben Gertzfield), barry@zope.com (Barry Warsaw)
"""Header encoding and decoding functionality."""
import re
from types import StringType, UnicodeType
import email.quopriMIME
import email.base64MIME
from email.Charset import Charset
@ -14,6 +16,12 @@
# Python 2.1 spells integer division differently
from email._compat21 import _floordiv
try:
True, False
except NameError:
True = 1
False = 0
CRLFSPACE = '\r\n '
CRLF = '\r\n'
NL = '\n'
@ -25,6 +33,9 @@
ENCODE = 1
DECODE = 2
USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')
# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
=\? # literal =?
@ -117,21 +128,19 @@ def make_header(decoded_seq, maxlinelen=None, header_name=None,
class Header:
def __init__(self, s=None, charset=None, maxlinelen=None, header_name=None,
continuation_ws=' '):
"""Create a MIME-compliant header that can contain many languages.
"""Create a MIME-compliant header that can contain many character sets.
Specify the initial header value in s. If None, the initial header
value is not set.
Optional s is the initial header value. If None, the initial header
value is not set. You can later append to the header with .append()
method calls. s may be a byte string or a Unicode string, but see the
.append() documentation for semantics.
Specify both s's character set, and the default character set by
setting the charset argument to a Charset object (not a character set
name string!). If None, a us-ascii Charset is used as both s's
initial charset and as the default character set for subsequent
.append() calls.
You can later append to the header with append(s, charset) below;
charset does not have to be the same as the one initially specified
here. In fact, it's optional, and if not given, defaults to the
charset specified in the constructor.
Optional charset serves two purposes: it has the same meaning as the
charset argument to the .append() method. It also sets the default
character set for all subsequent .append() calls that omit the charset
argument. If charset is not provided in the constructor, the us-ascii
charset is used both as s's initial charset and as the default for
subsequent .append() calls.
The maximum line length can be specified explicitly via maxlinelen. For
splitting the first line to a shorter value (to account for the field
@ -143,7 +152,7 @@ def __init__(self, s=None, charset=None, maxlinelen=None, header_name=None,
lines.
"""
if charset is None:
charset = Charset()
charset = USASCII
self._charset = charset
self._continuation_ws = continuation_ws
cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
@ -186,20 +195,43 @@ def __ne__(self, other):
return not self == other
def append(self, s, charset=None):
"""Append string s with Charset charset to the MIME header.
"""Append a string to the MIME header.
If charset is given, it should be a Charset instance, or the name of a
character set (which will be converted to a Charset instance). A
value of None (the default) means charset is the one given in the
class constructor.
Optional charset, if given, should be a Charset instance or the name
of a character set (which will be converted to a Charset instance). A
value of None (the default) means that the charset given in the
constructor is used.
s may be a byte string or a Unicode string. If it is a byte string
(i.e. isinstance(s, StringType) is true), then charset is the encoding
of that byte string, and a UnicodeError will be raised if the string
cannot be decoded with that charset. If `s' is a Unicode string, then
charset is a hint specifying the character set of the characters in
the string. In this case, when producing an RFC 2822 compliant header
using RFC 2047 rules, the Unicode string will be encoded using the
following charsets in order: us-ascii, the charset hint, utf-8.
"""
if charset is None:
charset = self._charset
elif not isinstance(charset, Charset):
charset = Charset(charset)
# Normalize and check the string
if isinstance(s, StringType):
# Possibly raise UnicodeError if it can't be decoded
unicode(s, charset.get_output_charset())
elif isinstance(s, UnicodeType):
# Convert Unicode to byte string for later concatenation
for charset in USASCII, charset, UTF8:
try:
s = s.encode(charset.get_output_charset())
break
except UnicodeError:
pass
else:
assert False, 'Could not encode to utf-8'
self._chunks.append((s, charset))
def _split(self, s, charset, firstline=0):
def _split(self, s, charset, firstline=False):
# Split up a header safely for use with encode_chunks. BAW: this
# appears to be a private convenience method.
splittable = charset.to_splittable(s)
@ -227,13 +259,13 @@ def _split(self, s, charset, firstline=0):
# We can split on _maxlinelen boundaries because we know that the
# encoding won't change the size of the string
splitpnt = self._maxlinelen
first = charset.from_splittable(splittable[:splitpnt], 0)
last = charset.from_splittable(splittable[splitpnt:], 0)
first = charset.from_splittable(splittable[:splitpnt], False)
last = charset.from_splittable(splittable[splitpnt:], False)
else:
# Divide and conquer.
halfway = _floordiv(len(splittable), 2)
first = charset.from_splittable(splittable[:halfway], 0)
last = charset.from_splittable(splittable[halfway:], 0)
first = charset.from_splittable(splittable[:halfway], False)
last = charset.from_splittable(splittable[halfway:], False)
# Do the split
return self._split(first, charset, firstline) + \
self._split(last, charset)
@ -248,7 +280,7 @@ def _ascii_split(self, s, charset, firstline):
line = lines.pop(0)
if firstline:
maxlinelen = self._firstlinelen
firstline = 0
firstline = False
else:
#line = line.lstrip()
maxlinelen = self._maxlinelen
@ -338,7 +370,7 @@ def _encode_chunks(self):
# There's no encoding for this chunk's charsets
_max_append(chunks, header, self._maxlinelen)
else:
_max_append(chunks, charset.header_encode(header, 0),
_max_append(chunks, charset.header_encode(header),
self._maxlinelen, ' ')
joiner = NL + self._continuation_ws
return joiner.join(chunks)
@ -363,6 +395,6 @@ def encode(self):
"""
newchunks = []
for s, charset in self._chunks:
newchunks += self._split(s, charset, 1)
newchunks += self._split(s, charset, True)
self._chunks = newchunks
return self._encode_chunks()