Patch #462190, patch #464070: Support quoted printable in the binascii module.

Decode and encode underscores for header style encoding. Fixes bug #463996.
This commit is contained in:
Martin v. Löwis 2001-09-30 20:32:11 +00:00
parent 5f12d755a8
commit 16dc7f44b1
6 changed files with 391 additions and 18 deletions

View file

@ -40,6 +40,24 @@ The length of \var{data} should be at most 57 to adhere to the base64
standard.
\end{funcdesc}
\begin{funcdesc}{a2b_qp}{string\optional{, header}}
Convert a block of quoted-printable data back to binary and return the
binary data. More than one line may be passed at a time.
If the optional argument \var{header} is present and true, underscores
will be decoded as spaces.
\end{funcdesc}
\begin{funcdesc}{b2a_qp}{data\optional{, quotetabs, istext, header}}
Convert binary data to one or more lines of \ASCII{} characters in
quoted-printable encoding. The return value is the converted line(s).
If the optional argument \var{quotetabs} is present and true, all tabs
and spaces will be encoded. If the optional argument \var{header} is
present and true, spaces will be encoded as underscores per \rfc{1522}.
If the optional argument \var{istext} is present and false, newline
characters will be encoded as well; otherwise linefeed conversion might
corrupt the binary data stream.
\end{funcdesc}
\begin{funcdesc}{a2b_hqx}{string}
Convert binhex4 formatted \ASCII{} data to binary, without doing
RLE-decompression. The string should contain a complete number of
@ -118,4 +136,6 @@ again.
\seemodule{binhex}{Support for the binhex format used on the Macintosh.}
\seemodule{uu}{Support for UU encoding used on \UNIX.}
\seemodule{quopri}{Support for quoted-printable encoding used in MIME email messages.}
\end{seealso}

View file

@ -7,21 +7,27 @@
This module performs quoted-printable transport encoding and decoding,
as defined in \rfc{1521}: ``MIME (Multipurpose Internet Mail Extensions)
Part One''. The quoted-printable encoding is designed for data where
there are relatively few nonprintable characters; the base64 encoding
scheme available via the \refmodule{base64} module is more compact if there
are many such characters, as when sending a graphics file.
as defined in \rfc{1521}: ``MIME (Multipurpose Internet Mail
Extensions) Part One: Mechanisms for Specifying and Describing the
Format of Internet Message Bodies''. The quoted-printable encoding is
designed for data where there are relatively few nonprintable
characters; the base64 encoding scheme available via the
\refmodule{base64} module is more compact if there are many such
characters, as when sending a graphics file.
\indexii{quoted-printable}{encoding}
\index{MIME!quoted-printable encoding}
\begin{funcdesc}{decode}{input, output}
\begin{funcdesc}{decode}{input, output\optional{,header}}
Decode the contents of the \var{input} file and write the resulting
decoded binary data to the \var{output} file.
\var{input} and \var{output} must either be file objects or objects that
mimic the file object interface. \var{input} will be read until
\code{\var{input}.readline()} returns an empty string.
If the optional argument \var{header} is present and true, underscore
will be decoded as space. This is used to decode
``Q''-encoded headers as described in \rfc{1522}: ``MIME (Multipurpose Internet Mail Extensions)
Part Two: Message Header Extensions for Non-ASCII Text''.
\end{funcdesc}
\begin{funcdesc}{encode}{input, output, quotetabs}
@ -36,7 +42,7 @@ when false it leaves them unencoded. Note that spaces and tabs
appearing at the end of lines are always encoded, as per \rfc{1521}.
\end{funcdesc}
\begin{funcdesc}{decodestring}{s}
\begin{funcdesc}{decodestring}{s\optional{,header}}
Like \function{decode()}, except that it accepts a source string and
returns the corresponding decoded string.
\end{funcdesc}

View file

@ -11,9 +11,14 @@
HEX = '0123456789ABCDEF'
EMPTYSTRING = ''
# Prefer the C implementations from binascii.  When they cannot be imported
# the names are set to None, and the functions in this module test for None
# before falling back to their pure-Python code paths.
try:
    from binascii import a2b_qp, b2a_qp
except ImportError:
    # Only a missing module/name should trigger the fallback; anything else
    # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
    a2b_qp = None
    b2a_qp = None
def needsquoting(c, quotetabs):
def needsquoting(c, quotetabs, header):
"""Decide whether a particular character needs to be quoted.
The 'quotetabs' flag indicates whether embedded tabs and spaces should be
@ -22,6 +27,9 @@ def needsquoting(c, quotetabs):
"""
if c in ' \t':
return quotetabs
# if header, we have to escape _ because _ is used to escape space
if c == '_':
return header
return c == ESCAPE or not (' ' <= c <= '~')
def quote(c):
@ -31,14 +39,23 @@ def quote(c):
def encode(input, output, quotetabs):
def encode(input, output, quotetabs, header = 0):
"""Read 'input', apply quoted-printable encoding, and write to 'output'.
'input' and 'output' are files with readline() and write() methods.
The 'quotetabs' flag indicates whether embedded tabs and spaces should be
quoted. Note that line-ending tabs and spaces are always encoded, as per
RFC 1521.
The 'header' flag indicates whether we are encoding spaces as _ as per
RFC 1522.
"""
if b2a_qp is not None:
data = input.read()
odata = b2a_qp(data, quotetabs = quotetabs, header = header)
output.write(odata)
return
def write(s, output=output, lineEnd='\n'):
# RFC 1521 requires that the line ending in a space or tab must have
# that trailing character encoded.
@ -60,9 +77,12 @@ def write(s, output=output, lineEnd='\n'):
stripped = '\n'
# Calculate the un-length-limited encoded line
for c in line:
if needsquoting(c, quotetabs):
if needsquoting(c, quotetabs, header):
c = quote(c)
outline.append(c)
if header and c == ' ':
outline.append('_')
else:
outline.append(c)
# First, write out the previous line
if prevline is not None:
write(prevline)
@ -80,19 +100,28 @@ def write(s, output=output, lineEnd='\n'):
if prevline is not None:
write(prevline, lineEnd=stripped)
def encodestring(s, quotetabs=0):
def encodestring(s, quotetabs = 0, header = 0):
if b2a_qp is not None:
return b2a_qp(s, quotetabs = quotetabs, header = header)
from cStringIO import StringIO
infp = StringIO(s)
outfp = StringIO()
encode(infp, outfp, quotetabs)
encode(infp, outfp, quotetabs, header)
return outfp.getvalue()
def decode(input, output):
def decode(input, output, header = 0):
"""Read 'input', apply quoted-printable decoding, and write to 'output'.
'input' and 'output' are files with readline() and write() methods.
If 'header' is true, decode underscore as space (per RFC 1522)."""
if a2b_qp is not None:
data = input.read()
odata = a2b_qp(data, header = header)
output.write(odata)
return
'input' and 'output' are files with readline() and write() methods."""
new = ''
while 1:
line = input.readline()
@ -107,7 +136,9 @@ def decode(input, output):
partial = 1
while i < n:
c = line[i]
if c != ESCAPE:
if c == '_' and header:
new = new + ' '; i = i+1
elif c != ESCAPE:
new = new + c; i = i+1
elif i+1 == n and not partial:
partial = 1; break
@ -123,11 +154,13 @@ def decode(input, output):
if new:
output.write(new)
def decodestring(s):
def decodestring(s, header = 0):
if a2b_qp is not None:
return a2b_qp(s, header = header)
from cStringIO import StringIO
infp = StringIO(s)
outfp = StringIO()
decode(infp, outfp)
decode(infp, outfp, header = header)
return outfp.getvalue()

View file

@ -104,6 +104,12 @@ class QuopriTestCase(unittest.TestCase):
('hello\tworld', 'hello=09world'),
)
# These are used in the "header=1" tests.
HSTRINGS = (
('hello world', 'hello_world'),
('hello_world', 'hello=5Fworld'),
)
def test_encodestring(self):
for p, e in self.STRINGS:
self.assert_(encodestring(p) == e)
@ -135,6 +141,13 @@ def test_embedded_ws(self):
self.assert_(encodestring(p, quotetabs=1) == e)
self.assert_(decodestring(e) == p)
def test_encode_header(self):
for p, e in self.HSTRINGS:
self.assert_(encodestring(p, header = 1) == e)
def test_decode_header(self):
for p, e in self.HSTRINGS:
self.assert_(decodestring(e, header = 1) == p)
def test_main():
test_support.run_unittest(QuopriTestCase)

View file

@ -6,8 +6,13 @@ Type/class unification and new-style classes
Core
- binascii now has two quopri support functions, a2b_qp and b2a_qp.
Library
- quopri's encode and decode functions take an optional header parameter,
which indicates whether output is intended for the header 'Q' encoding.
Tools
Build

View file

@ -42,6 +42,15 @@
** does make the performance sub-optimal. Oh well, too bad...
**
** Jack Jansen, CWI, July 1995.
**
** Added support for quoted-printable encoding, based on RFC 1521 et al.
** Quoted-printable encoding specifies that non-printable characters (anything
** below 32 and above 126) be encoded as =XX, where XX is the hexadecimal value
** of the character. It also specifies some other behavior to enable 8-bit data
** in a mail message with little difficulty (maximum line sizes, protecting
** some cases of whitespace, etc).
**
** Brandon Long, September 2001.
*/
@ -971,6 +980,289 @@ static char doc_unhexlify[] =
hexstr must contain an even number of hex digits (upper or lower case).\n\
This function is also available as \"unhexlify()\"";
/* Hex-digit lookup for 7-bit ASCII.  Entries for '0'-'9', 'A'-'F' and
 * 'a'-'f' hold the digit's numeric value (0-15); every other entry is -1.
 * The table is never written to, hence const. */
static const int table_hex[128] = {
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     0, 1, 2, 3,  4, 5, 6, 7,  8, 9,-1,-1, -1,-1,-1,-1,
    -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1
};

/* Numeric value of hex digit c, or -1 if c is not a hex digit.
 * The range check keeps bytes >= 0x80 (and negative chars, which become
 * large values after the cast) from indexing past the 128-entry table.
 * Note: c is evaluated twice, so do not pass an expression with side
 * effects. */
#define hexval(c) \
    (((unsigned int)(c) < 128) ? table_hex[(unsigned int)(c)] : -1)

/* Line-length limit used when the encoder inserts soft line breaks. */
#define MAXLINESIZE 76

static char doc_a2b_qp[] = "Decode a string of qp-encoded data";
/* Return nonzero if ch is an ASCII hexadecimal digit (locale-independent). */
static int
qp_is_hex_digit(unsigned char ch)
{
    return (ch >= '0' && ch <= '9') ||
           (ch >= 'A' && ch <= 'F') ||
           (ch >= 'a' && ch <= 'f');
}

/* a2b_qp(data, header=0): decode quoted-printable data.
 *
 * Arguments (parsed from args/kwargs):
 *   data   - the encoded string; more than one line may be passed.
 *   header - when true, '_' is decoded as a space.
 *
 * Decoding rules implemented below:
 *   "=" followed by CR, LF, space or tab  -> soft line break; everything up
 *                                            to and including the next LF is
 *                                            dropped.
 *   "=="                                  -> a single '='.
 *   "=XX" with two hex digits             -> the byte with value 0xXX.
 *   "=" followed by anything else         -> a literal '='; the following
 *                                            character is then processed
 *                                            normally.
 *   "=" as the very last byte             -> dropped.
 *
 * Returns a new string object, or NULL with an exception set if argument
 * parsing or memory allocation fails.
 */
static PyObject*
binascii_a2b_qp(PyObject *self, PyObject *args, PyObject *kwargs)
{
    unsigned int in, out;
    unsigned char ch;
    unsigned char *data, *odata;
    unsigned int datalen = 0;
    PyObject *rv;
    static char *kwlist[] = {"data", "header", NULL};
    int header = 0;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|i", kwlist, &data,
                                     &datalen, &header))
        return NULL;

    /* Decoding never produces more bytes than it consumes, so a buffer the
     * size of the input is always enough (usually overkill).  Request at
     * least one byte: calloc(1, 0) is allowed to return NULL, which would
     * otherwise be misreported as an out-of-memory error for empty input. */
    odata = (unsigned char *) calloc(1, datalen ? datalen : 1);
    if (odata == NULL) {
        PyErr_NoMemory();
        return NULL;
    }

    in = out = 0;
    while (in < datalen) {
        if (data[in] == '=') {
            in++;
            if (in >= datalen)
                break;
            if ((data[in] == '\n') || (data[in] == '\r') ||
                (data[in] == ' ') || (data[in] == '\t')) {
                /* Soft line break: skip through the end of the line. */
                if (data[in] != '\n') {
                    while (in < datalen && data[in] != '\n')
                        in++;
                }
                if (in < datalen)
                    in++;
            }
            else if (data[in] == '=') {
                /* broken case from broken python qp */
                odata[out++] = '=';
                in++;
            }
            else if ((in + 1 < datalen) &&
                     qp_is_hex_digit(data[in]) &&
                     qp_is_hex_digit(data[in+1])) {
                /* "=XX": both digits are verified and explicitly inside
                 * the input, so the hexval() lookups are safe. */
                ch = (unsigned char) (hexval(data[in]) << 4);
                in++;
                ch |= (unsigned char) hexval(data[in]);
                in++;
                odata[out++] = ch;
            }
            else {
                /* Not a valid escape: keep the '=' literally and let the
                 * next iteration handle the character that follows. */
                odata[out++] = '=';
            }
        }
        else if (header && data[in] == '_') {
            odata[out++] = ' ';
            in++;
        }
        else {
            odata[out++] = data[in++];
        }
    }

    /* rv is NULL on failure; the scratch buffer is released either way. */
    rv = PyString_FromStringAndSize((char *) odata, out);
    free(odata);
    return rv;
}
/* Write the two uppercase hexadecimal digits of ch into s[0] (high nibble)
 * and s[1] (low nibble).  No terminating NUL is written.  Always returns 0. */
static int
to_hex (unsigned char ch, unsigned char *s)
{
    static const char digits[] = "0123456789ABCDEF";

    s[0] = digits[(ch >> 4) & 0x0F];
    s[1] = digits[ch & 0x0F];
    return 0;
}
/* Docstring for b2a_qp; registered alongside binascii_b2a_qp in the
 * module's method table. */
static char doc_b2a_qp[] =
"b2a_qp(data, quotetabs=0, istext=1, header=0) -> s; \n\
Encode a string using quoted-printable encoding. \n\
\n\
On encoding, when istext is set, newlines are not encoded, and white \n\
space at end of lines is. When istext is not set, \\r and \\n (CR/LF) are \n\
both encoded. When quotetabs is set, space and tabs are encoded.";
/* XXX: This is ridiculously complicated to be backward compatible
 * (mostly) with the quopri module.  It doesn't re-create the quopri
 * module bug where text ending in CRLF has the CR encoded */

/* Decide whether data[in] must be written as an "=XX" escape.
 *
 * Shared by the sizing pass and the writing pass of binascii_b2a_qp so the
 * two can never disagree (a disagreement would overflow the output buffer).
 *
 *   - bytes above 126 and '=' are always escaped;
 *   - '_' is escaped in header mode, where '_' stands for a space;
 *   - '.' is escaped when exactly one character is already on the line
 *     NOTE(review): linelen == 1 is kept from the original code; a test
 *     for the start of the line (linelen == 0) may have been intended;
 *   - CR and LF are escaped only when istext is false;
 *   - tab and space are escaped when quotetabs is true, or when they are
 *     the very last byte of the input;
 *   - every other byte below 33 (control characters) is always escaped.
 */
static int
qp_needs_escape(const unsigned char *data, unsigned int datalen,
                unsigned int in, unsigned int linelen,
                int quotetabs, int istext, int header)
{
    unsigned char c = data[in];

    if (c > 126 || c == '=')
        return 1;
    if (header && c == '_')
        return 1;
    if (c == '.' && linelen == 1)
        return 1;
    if (c == '\r' || c == '\n')
        return !istext;
    if (c == '\t' || c == ' ')
        return quotetabs || (in + 1 == datalen);
    return c < 33;
}

/* Return nonzero if data[in] starts a line ending: a bare LF, or a CR that
 * is immediately followed by LF. */
static int
qp_at_line_end(const unsigned char *data, unsigned int datalen,
               unsigned int in)
{
    if (data[in] == '\n')
        return 1;
    return (in + 1 < datalen) && (data[in] == '\r') && (data[in+1] == '\n');
}

/* b2a_qp(data, quotetabs=0, istext=1, header=0): quoted-printable encode.
 *
 * Arguments (parsed from args/kwargs):
 *   data      - the bytes to encode.
 *   quotetabs - when true, tabs and spaces are escaped as well.
 *   istext    - when true, line endings are passed through (normalized to
 *               the style detected in the input) and whitespace directly
 *               before a line ending is escaped; when false, CR and LF are
 *               escaped like any other control character.
 *   header    - when true, spaces are written as '_' and '_' is escaped.
 *
 * The work is done in two passes over the input: the first computes the
 * size of the output, the second fills a buffer of that size.  Soft line
 * breaks ("=" + newline) are inserted to keep lines under MAXLINESIZE.
 *
 * Returns a new string object, or NULL with an exception set if argument
 * parsing or memory allocation fails.
 */
static PyObject*
binascii_b2a_qp (PyObject *self, PyObject *args, PyObject *kwargs)
{
    unsigned int in, out;
    unsigned char *data, *odata;
    unsigned int datalen = 0, odatalen = 0;
    PyObject *rv;
    unsigned int linelen = 0;
    static char *kwlist[] = {"data", "quotetabs", "istext", "header", NULL};
    int istext = 1;
    int quotetabs = 0;
    int header = 0;
    unsigned char ch;
    int crlf = 0;
    unsigned char *p;
    unsigned int softbreak;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|iii", kwlist, &data,
                                     &datalen, &quotetabs, &istext, &header))
        return NULL;

    /* See if this string is using CRLF line ends */
    /* XXX: this function has the side effect of converting all of
     * the end of lines to be the same depending on this detection
     * here */
    /* memchr rather than strchr: the input is binary and may contain NUL
     * bytes in front of the first newline. */
    p = (unsigned char *) memchr(data, '\n', datalen);
    if ((p != NULL) && (p > data) && (*(p-1) == '\r'))
        crlf = 1;

    /* Bytes taken by a soft line break: '=' plus LF or CRLF. */
    softbreak = crlf ? 3 : 2;

    /* First pass: compute how many output bytes are needed. */
    in = 0;
    while (in < datalen) {
        if (qp_needs_escape(data, datalen, in, linelen,
                            quotetabs, istext, header)) {
            if ((linelen + 3) >= MAXLINESIZE) {
                linelen = 0;
                odatalen += softbreak;
            }
            linelen += 3;
            odatalen += 3;
            in++;
        }
        else if (istext && qp_at_line_end(data, datalen, in)) {
            linelen = 0;
            /* Protect against whitespace on end of line: the preceding
             * space/tab may grow from one byte to a three-byte escape.
             * (In header mode this can over-estimate, which is harmless.) */
            if (in && ((data[in-1] == ' ') || (data[in-1] == '\t')))
                odatalen += 2;
            odatalen += crlf ? 2 : 1;
            in += (data[in] == '\r') ? 2 : 1;
        }
        else {
            if ((in + 1 != datalen) &&
                (data[in+1] != '\n') &&
                (linelen + 1) >= MAXLINESIZE) {
                linelen = 0;
                odatalen += softbreak;
            }
            linelen++;
            odatalen++;
            in++;
        }
    }

    /* Request at least one byte: calloc(1, 0) is allowed to return NULL,
     * which would otherwise be misreported as out-of-memory for empty
     * input. */
    odata = (unsigned char *) calloc(1, odatalen ? odatalen : 1);
    if (odata == NULL) {
        PyErr_NoMemory();
        return NULL;
    }

    /* Second pass: same walk as above, this time emitting the bytes. */
    in = out = linelen = 0;
    while (in < datalen) {
        if (qp_needs_escape(data, datalen, in, linelen,
                            quotetabs, istext, header)) {
            if ((linelen + 3) >= MAXLINESIZE) {
                odata[out++] = '=';
                if (crlf) odata[out++] = '\r';
                odata[out++] = '\n';
                linelen = 0;
            }
            odata[out++] = '=';
            to_hex(data[in], &odata[out]);
            out += 2;
            in++;
            linelen += 3;
        }
        else if (istext && qp_at_line_end(data, datalen, in)) {
            linelen = 0;
            /* Protect against whitespace on end of line: rewrite the
             * space/tab just emitted as an =XX escape. */
            if (out && ((odata[out-1] == ' ') || (odata[out-1] == '\t'))) {
                ch = odata[out-1];
                odata[out-1] = '=';
                to_hex(ch, &odata[out]);
                out += 2;
            }
            if (crlf) odata[out++] = '\r';
            odata[out++] = '\n';
            in += (data[in] == '\r') ? 2 : 1;
        }
        else {
            if ((in + 1 != datalen) &&
                (data[in+1] != '\n') &&
                (linelen + 1) >= MAXLINESIZE) {
                odata[out++] = '=';
                if (crlf) odata[out++] = '\r';
                odata[out++] = '\n';
                linelen = 0;
            }
            linelen++;
            if (header && data[in] == ' ') {
                /* Header ('Q') encoding writes a space as '_'. */
                odata[out++] = '_';
                in++;
            }
            else {
                odata[out++] = data[in++];
            }
        }
    }

    /* rv is NULL on failure; the scratch buffer is released either way. */
    rv = PyString_FromStringAndSize((char *) odata, out);
    free(odata);
    return rv;
}
/* List of functions defined in the module */
@ -990,6 +1282,10 @@ static struct PyMethodDef binascii_module_methods[] = {
doc_rledecode_hqx},
{"crc_hqx", binascii_crc_hqx, METH_VARARGS, doc_crc_hqx},
{"crc32", binascii_crc32, METH_VARARGS, doc_crc32},
{"a2b_qp", (PyCFunction)binascii_a2b_qp, METH_VARARGS | METH_KEYWORDS,
doc_a2b_qp},
{"b2a_qp", (PyCFunction)binascii_b2a_qp, METH_VARARGS | METH_KEYWORDS,
doc_b2a_qp},
{NULL, NULL} /* sentinel */
};