Patch #462190, patch #464070: Support quoted printable in the binascii module.

Decode and encode underscores for header style encoding. Fixes bug #463996.
2024-09-06 10:59:04 +00:00 · 2001-09-30 20:32:11 +00:00 · 2001-09-30 20:32:11 +00:00 · 16dc7f44b1
parent 5f12d755a8
commit 16dc7f44b1
6 changed files with 391 additions and 18 deletions
--- a/Doc/lib/libbinascii.tex
+++ b/Doc/lib/libbinascii.tex
@ -40,6 +40,24 @@ The length of \var{data} should be at most 57 to adhere to the base64
 standard.
 \end{funcdesc}

+\begin{funcdesc}{a2b_qp}{string\optional{, header}}
+Convert a block of quoted-printable data back to binary and return the
+binary data. More than one line may be passed at a time.
+If the optional argument \var{header} is present and true, underscores
+will be decoded as spaces.
+\end{funcdesc}
+
+\begin{funcdesc}{b2a_qp}{data\optional{, quotetabs, istext, header}}
+Convert binary data to one or more lines of \ASCII{} characters in
+quoted-printable encoding.  The return value is the converted line(s).
+If the optional argument \var{quotetabs} is present and true, all tabs
+and spaces will be encoded.  If the optional argument \var{header} is
+present and true, spaces will be encoded as underscores per RFC1522.
+If the optional argument \var{istext} is present and false, newline
+characters will be encoded as well, otherwise linefeed conversion might
+corrupt the binary data stream.
+\end{funcdesc}
+
 \begin{funcdesc}{a2b_hqx}{string}
 Convert binhex4 formatted \ASCII{} data to binary, without doing
 RLE-decompression. The string should contain a complete number of
@ -118,4 +136,6 @@ again.
  \seemodule{binhex}{Support for the binhex format used on the Macintosh.}

  \seemodule{uu}{Support for UU encoding used on \UNIX.}
+
+  \seemodule{quopri}{Support for quoted-printable encoding used in MIME email messages.}
 \end{seealso}
--- a/Doc/lib/libquopri.tex
+++ b/Doc/lib/libquopri.tex
@ -7,21 +7,27 @@


 This module performs quoted-printable transport encoding and decoding,
-as defined in \rfc{1521}: ``MIME (Multipurpose Internet Mail Extensions)
-Part One''.  The quoted-printable encoding is designed for data where
-there are relatively few nonprintable characters; the base64 encoding
-scheme available via the \refmodule{base64} module is more compact if there
-are many such characters, as when sending a graphics file.
+as defined in \rfc{1521}: ``MIME (Multipurpose Internet Mail
+Extensions) Part One: Mechanisms for Specifying and Describing the
+Format of Internet Message Bodies''.  The quoted-printable encoding is
+designed for data where there are relatively few nonprintable
+characters; the base64 encoding scheme available via the
+\refmodule{base64} module is more compact if there are many such
+characters, as when sending a graphics file.
 \indexii{quoted-printable}{encoding}
 \index{MIME!quoted-printable encoding}


-\begin{funcdesc}{decode}{input, output}
+\begin{funcdesc}{decode}{input, output\optional{,header}}
 Decode the contents of the \var{input} file and write the resulting
 decoded binary data to the \var{output} file.
 \var{input} and \var{output} must either be file objects or objects that
 mimic the file object interface. \var{input} will be read until
 \code{\var{input}.readline()} returns an empty string.
+If the optional argument \var{header} is present and true, underscore
+will be decoded as space. This is used to decode
+``Q''-encoded headers as described in \rfc{1522}: ``MIME (Multipurpose Internet Mail Extensions)
+Part Two: Message Header Extensions for Non-ASCII Text''.
 \end{funcdesc}

 \begin{funcdesc}{encode}{input, output, quotetabs}
@ -36,7 +42,7 @@ when false it leaves them unencoded.  Note that spaces and tabs
 appearing at the end of lines are always encoded, as per \rfc{1521}.
 \end{funcdesc}

-\begin{funcdesc}{decodestring}{s}
+\begin{funcdesc}{decodestring}{s\optional{,header}}
 Like \function{decode()}, except that it accepts a source string and
 returns the corresponding decoded string.
 \end{funcdesc}
--- a/Lib/quopri.py
+++ b/Lib/quopri.py
@ -11,9 +11,14 @@
 HEX = '0123456789ABCDEF'
 EMPTYSTRING = ''

+# Prefer the C implementations from binascii when they are available;
+# the pure-Python code below is used when either name is None.
+try:
+  from binascii import a2b_qp, b2a_qp
+except ImportError:
+  # Only a missing module/function should trigger the fallback; a bare
+  # "except:" would also hide KeyboardInterrupt, SystemExit and real bugs.
+  a2b_qp = None
+  b2a_qp = None


-def needsquoting(c, quotetabs):
+def needsquoting(c, quotetabs, header):
    """Decide whether a particular character needs to be quoted.

    The 'quotetabs' flag indicates whether embedded tabs and spaces should be
@ -22,6 +27,9 @@ def needsquoting(c, quotetabs):
    """
    if c in ' \t':
        return quotetabs
+    # if header, we have to escape _ because _ is used to escape space
+    if c == '_': 
+        return header
    return c == ESCAPE or not (' ' <= c <= '~')

 def quote(c):
@ -31,14 +39,23 @@ def quote(c):



-def encode(input, output, quotetabs):
+def encode(input, output, quotetabs, header = 0):
    """Read 'input', apply quoted-printable encoding, and write to 'output'.

    'input' and 'output' are files with readline() and write() methods.
    The 'quotetabs' flag indicates whether embedded tabs and spaces should be
    quoted.  Note that line-ending tabs and spaces are always encoded, as per
    RFC 1521.
+    The 'header' flag indicates whether we are encoding spaces as _ as per
+    RFC 1522.
    """
+
+    if b2a_qp is not None:
+        data = input.read()
+        odata = b2a_qp(data, quotetabs = quotetabs, header = header)
+        output.write(odata)
+        return
+      
    def write(s, output=output, lineEnd='\n'):
        # RFC 1521 requires that the line ending in a space or tab must have
        # that trailing character encoded.
@ -60,9 +77,12 @@ def write(s, output=output, lineEnd='\n'):
            stripped = '\n'
        # Calculate the un-length-limited encoded line
        for c in line:
-            if needsquoting(c, quotetabs):
+            if needsquoting(c, quotetabs, header):
                c = quote(c)
-            outline.append(c)
+            if header and c == ' ':
+                outline.append('_')
+            else:
+                outline.append(c)
        # First, write out the previous line
        if prevline is not None:
            write(prevline)
@ -80,19 +100,28 @@ def write(s, output=output, lineEnd='\n'):
    if prevline is not None:
        write(prevline, lineEnd=stripped)

-def encodestring(s, quotetabs=0):
+def encodestring(s, quotetabs = 0, header = 0):
+    if b2a_qp is not None:
+        return b2a_qp(s, quotetabs = quotetabs, header = header)
    from cStringIO import StringIO
    infp = StringIO(s)
    outfp = StringIO()
-    encode(infp, outfp, quotetabs)
+    encode(infp, outfp, quotetabs, header)
    return outfp.getvalue()



-def decode(input, output):
+def decode(input, output, header = 0):
    """Read 'input', apply quoted-printable decoding, and write to 'output'.
+    'input' and 'output' are files with readline() and write() methods.
+    If 'header' is true, decode underscore as space (per RFC 1522)."""
+
+    if a2b_qp is not None:
+        data = input.read()
+        odata = a2b_qp(data, header = header)
+        output.write(odata)
+        return

-    'input' and 'output' are files with readline() and write() methods."""
    new = ''
    while 1:
        line = input.readline()
@ -107,7 +136,9 @@ def decode(input, output):
            partial = 1
        while i < n:
            c = line[i]
-            if c != ESCAPE:
+            if c == '_' and header:
+                new = new + ' '; i = i+1
+            elif c != ESCAPE:
                new = new + c; i = i+1
            elif i+1 == n and not partial:
                partial = 1; break
@ -123,11 +154,13 @@ def decode(input, output):
    if new:
        output.write(new)

-def decodestring(s):
+def decodestring(s, header = 0):
+    if a2b_qp is not None:
+        return a2b_qp(s, header = header)
    from cStringIO import StringIO
    infp = StringIO(s)
    outfp = StringIO()
-    decode(infp, outfp)
+    decode(infp, outfp, header = header)
    return outfp.getvalue()


--- a/Lib/test/test_quopri.py
+++ b/Lib/test/test_quopri.py
@ -104,6 +104,12 @@ class QuopriTestCase(unittest.TestCase):
        ('hello\tworld', 'hello=09world'),
        )

+    # These are used in the "header=1" tests.
+    HSTRINGS = (
+        ('hello world', 'hello_world'),
+        ('hello_world', 'hello=5Fworld'),
+        )
+
    def test_encodestring(self):
        for p, e in self.STRINGS:
            self.assert_(encodestring(p) == e)
@ -135,6 +141,13 @@ def test_embedded_ws(self):
            self.assert_(encodestring(p, quotetabs=1) == e)
            self.assert_(decodestring(e) == p)

+    def test_encode_header(self):
+        # Each HSTRINGS entry is a (plain, encoded) pair; with header=1 a
+        # space must come out as '_' and a literal '_' as '=5F'.
+        for p, e in self.HSTRINGS:
+            self.assert_(encodestring(p, header = 1) == e)
+
+    def test_decode_header(self):
+        # Inverse of the header-encoding check: decoding the encoded half of
+        # each HSTRINGS pair with header=1 must give back the plain half.
+        for p, e in self.HSTRINGS:
+            self.assert_(decodestring(e, header = 1) == p)

 def test_main():
    test_support.run_unittest(QuopriTestCase)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -6,8 +6,13 @@ Type/class unification and new-style classes

 Core

+- binascii now has two quopri support functions, a2b_qp and b2a_qp.
+
 Library

+- quopri's encode and decode functions take an optional header parameter,
+  which indicates whether the data uses the 'Q' encoding for headers.
+
 Tools

 Build
--- a/Modules/binascii.c
+++ b/Modules/binascii.c
@ -42,6 +42,15 @@
 ** does make the performance sub-optimal. Oh well, too bad...
 **
 ** Jack Jansen, CWI, July 1995.
+**
+** Added support for quoted-printable encoding, based on RFC 1521 et al.
+** Quoted-printable encoding specifies that non-printable characters (anything
+** below 32 and above 126) be encoded as =XX where XX is the hexadecimal value
+** of the character.  It also specifies some other behavior to enable 8bit data
+** in a mail message with little difficulty (maximum line sizes, protecting
+** some cases of whitespace, etc).
+**
+** Brandon Long, September 2001.
 */


@ -971,6 +980,289 @@ static char doc_unhexlify[] =
 hexstr must contain an even number of hex digits (upper or lower case).\n\
 This function is also available as \"unhexlify()\"";

+/* Lookup table from 7-bit ASCII code to hexadecimal digit value.
+ * Each row covers 16 codes: '0'-'9' map to 0-9, 'A'-'F' and 'a'-'f' map
+ * to 10-15, and every other code maps to -1 (not a hex digit). */
+static int table_hex[128] = {
+  -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+  -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+  -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+   0, 1, 2, 3,  4, 5, 6, 7,  8, 9,-1,-1, -1,-1,-1,-1,
+  -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+  -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+  -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+  -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1
+};
+
+/* Hex digit value of character c.  The index is NOT range-checked and the
+ * table has only 128 entries, so the caller must make sure c is below 128
+ * (the decoder only uses this after checking that c is a hex digit). */
+#define hexval(c) table_hex[(unsigned int)(c)]
+
+/* Line-length threshold used by the encoder to decide when to insert a
+ * soft line break ("=" followed by a newline). */
+#define MAXLINESIZE 76
+
+/* Docstring exposed to Python for a2b_qp(). */
+static char doc_a2b_qp[] = "Decode a string of qp-encoded data";
+
+/* Return non-zero when c is an ASCII hexadecimal digit (0-9, A-F, a-f). */
+static int
+qp_is_hex_digit(unsigned char c)
+{
+	return (c >= '0' && c <= '9') ||
+	       (c >= 'A' && c <= 'F') ||
+	       (c >= 'a' && c <= 'f');
+}
+
+/* Decode quoted-printable text into the raw bytes it represents.
+ *
+ * Python signature: a2b_qp(data, header=0) -> string
+ *
+ *   data    the quoted-printable text; it may span several lines.
+ *   header  when true, '_' is decoded as a space (RFC 1522 "Q" encoding).
+ *
+ * Returns a new string object, or NULL with an exception set when argument
+ * parsing or memory allocation fails.  Malformed input is tolerated rather
+ * than rejected: a '=' that is not followed by two hex digits is copied
+ * through literally, "==" yields a single '=', and a lone '=' at the very
+ * end of the input is dropped.
+ */
+static PyObject *
+binascii_a2b_qp(PyObject *self, PyObject *args, PyObject *kwargs)
+{
+	unsigned int in, out;
+	char ch;
+	unsigned char *data, *odata;
+	unsigned int datalen = 0;
+	PyObject *rv;
+	static char *kwlist[] = {"data", "header", NULL};
+	int header = 0;
+
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|i", kwlist, &data,
+	      &datalen, &header))
+		return NULL;
+
+	/* Decoding never produces more bytes than it consumes, so the input
+	 * length is a safe (if generous) upper bound for the output.  Always
+	 * request at least one byte: calloc(1, 0) is allowed to return NULL,
+	 * which would be misreported as MemoryError for empty input. */
+	odata = (unsigned char *) calloc(1, datalen ? datalen : 1);
+
+	if (odata == NULL) {
+		PyErr_NoMemory();
+		return NULL;
+	}
+
+	in = out = 0;
+	while (in < datalen) {
+		if (data[in] == '=') {
+			in++;
+			/* A trailing '=' with nothing after it is dropped */
+			if (in >= datalen) break;
+			/* Soft line breaks: '=' followed by optional junk
+			 * whitespace and a newline produces no output */
+			if ((data[in] == '\n') || (data[in] == '\r') ||
+			    (data[in] == ' ') || (data[in] == '\t')) {
+				if (data[in] != '\n') {
+					while (in < datalen && data[in] != '\n') in++;
+				}
+				if (in < datalen) in++;
+			}
+			else if (data[in] == '=') {
+				/* broken case from broken python qp */
+				odata[out++] = '=';
+				in++;
+			}
+			else if ((in + 1 < datalen) &&
+				 qp_is_hex_digit(data[in]) &&
+				 qp_is_hex_digit(data[in+1])) {
+				/* =XX escape.  The bounds check above keeps
+				 * a truncated escape at the end of the input
+				 * from reading one byte past the buffer. */
+				ch = hexval(data[in]) << 4;
+				in++;
+				ch |= hexval(data[in]);
+				in++;
+				odata[out++] = ch;
+			}
+			else {
+				/* Not a valid escape: keep the '=' and let
+				 * the next iteration copy what follows it */
+				odata[out++] = '=';
+			}
+		}
+		else if (header && data[in] == '_') {
+			odata[out++] = ' ';
+			in++;
+		}
+		else {
+			odata[out] = data[in];
+			in++;
+			out++;
+		}
+	}
+	rv = PyString_FromStringAndSize((char *) odata, out);
+	/* The string object owns its own copy; rv may be NULL on failure */
+	free (odata);
+	return rv;
+}
+
+/* Write the two uppercase hexadecimal digits of ch into s[0] (high nibble)
+ * and s[1] (low nibble).  No terminator is written.  Always returns 0. */
+static int
+to_hex (unsigned char ch, unsigned char *s)
+{
+	static const char digits[] = "0123456789ABCDEF";
+
+	s[0] = digits[(ch >> 4) & 0x0F];
+	s[1] = digits[ch & 0x0F];
+	return 0;
+}
+
+static char doc_b2a_qp[] =
+"b2a_qp(data, quotetabs=0, istext=1, header=0) -> s; \n\
+ Encode a string using quoted-printable encoding. \n\
+\n\
+On encoding, when istext is set, newlines are not encoded, and white \n\
+space at end of lines is.  When istext is not set, \\r and \\n (CR/LF) are \n\
+both encoded.  When quotetabs is set, space and tabs are encoded.";
+
+/* Return non-zero when the byte at data[in] must be written as an =XX
+ * escape.  linelen is the length of the output line built so far.
+ *
+ * Both passes of binascii_b2a_qp() call this, so the size computed by the
+ * first pass always matches what the second pass writes. */
+static int
+qp_must_quote(const unsigned char *data, unsigned int in,
+	      unsigned int datalen, unsigned int linelen,
+	      int quotetabs, int istext, int header)
+{
+	unsigned char c = data[in];
+
+	/* Non-ASCII/DEL and the escape character itself */
+	if ((c > 126) || (c == '='))
+		return 1;
+	/* In header mode '_' stands for a space, so a real '_' is escaped */
+	if (header && c == '_')
+		return 1;
+	if ((c == '.') && (linelen == 1))
+		return 1;
+	/* Binary mode: line-end characters are data, not line structure */
+	if (!istext && ((c == '\r') || (c == '\n')))
+		return 1;
+	/* Whitespace as the very last byte of the input */
+	if ((c == '\t' || c == ' ') && (in + 1 == datalen))
+		return 1;
+	/* Control characters are always escaped; tab and space only when
+	 * quotetabs is set.  (This used to read
+	 * "quotetabs && (c != '\t' || c != ' ')", whose inner test is always
+	 * true, so control characters slipped through unescaped whenever
+	 * quotetabs was off.) */
+	if ((c < 33) && (c != '\r') && (c != '\n') &&
+	    (quotetabs || ((c != '\t') && (c != ' '))))
+		return 1;
+	return 0;
+}
+
+/* Return non-zero when data[in] starts a line end: "\n" or "\r\n". */
+static int
+qp_at_line_end(const unsigned char *data, unsigned int in,
+	       unsigned int datalen)
+{
+	return (data[in] == '\n') ||
+	       ((in + 1 < datalen) && (data[in] == '\r') &&
+		(data[in+1] == '\n'));
+}
+
+/* Encode binary data as quoted-printable text.
+ *
+ * Python signature: b2a_qp(data, quotetabs=0, istext=1, header=0) -> string
+ *
+ *   quotetabs  when true, tabs and spaces are escaped as well.
+ *   istext     when true, line ends are kept as line ends and whitespace
+ *              directly before them is escaped; when false, CR and LF are
+ *              escaped like any other byte.
+ *   header     when true, spaces are written as '_' and '_' is escaped.
+ *
+ * Returns a new string object, or NULL with an exception set when argument
+ * parsing or memory allocation fails.
+ *
+ * The work is done in two passes over the input: the first computes an
+ * upper bound for the output size, the second writes the output.
+ *
+ * XXX: This is ridiculously complicated to be backward compatible
+ * (mostly) with the quopri module.  It doesn't re-create the quopri
+ * module bug where text ending in CRLF has the CR encoded */
+static PyObject *
+binascii_b2a_qp (PyObject *self, PyObject *args, PyObject *kwargs)
+{
+	unsigned int in, out;
+	unsigned char *data, *odata;
+	unsigned int datalen = 0, odatalen = 0;
+	PyObject *rv;
+	unsigned int linelen = 0;
+	static char *kwlist[] = {"data", "quotetabs", "istext", "header", NULL};
+	int istext = 1;
+	int quotetabs = 0;
+	int header = 0;
+	unsigned char ch;
+	int crlf = 0;
+	unsigned char *p;
+
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|iii", kwlist, &data,
+	      &datalen, &quotetabs, &istext, &header))
+		return NULL;
+
+	/* See if this string is using CRLF line ends */
+	/* XXX: this function has the side effect of converting all of
+	 * the end of lines to be the same depending on this detection
+	 * here */
+	/* memchr, not strchr: the data is length-counted and may contain
+	 * NUL bytes, which would end a strchr search too early. */
+	p = (unsigned char *) memchr(data, '\n', datalen);
+	if ((p != NULL) && (p > data) && (*(p-1) == '\r'))
+		crlf = 1;
+
+	/* First, scan to see how many characters need to be encoded */
+	in = 0;
+	while (in < datalen) {
+		if (qp_must_quote(data, in, datalen, linelen,
+				  quotetabs, istext, header)) {
+			if ((linelen + 3) >= MAXLINESIZE) {
+				/* soft break: '=' plus the line end */
+				linelen = 0;
+				odatalen += crlf ? 3 : 2;
+			}
+			linelen += 3;
+			odatalen += 3;
+			in++;
+		}
+		else if (istext && qp_at_line_end(data, in, datalen)) {
+			linelen = 0;
+			/* Protect against whitespace on end of line; this
+			 * may over-count, which only over-allocates */
+			if (in && ((data[in-1] == ' ') || (data[in-1] == '\t')))
+				odatalen += 2;
+			odatalen += crlf ? 2 : 1;
+			in += (data[in] == '\r') ? 2 : 1;
+		}
+		else {
+			if ((in + 1 != datalen) &&
+			    (data[in+1] != '\n') &&
+			    (linelen + 1) >= MAXLINESIZE) {
+				linelen = 0;
+				odatalen += crlf ? 3 : 2;
+			}
+			linelen++;
+			odatalen++;
+			in++;
+		}
+	}
+
+	/* Always request at least one byte: calloc(1, 0) is allowed to
+	 * return NULL, which would be misreported as MemoryError for empty
+	 * input. */
+	odata = (unsigned char *) calloc(1, odatalen ? odatalen : 1);
+
+	if (odata == NULL) {
+		PyErr_NoMemory();
+		return NULL;
+	}
+
+	/* Second pass: same decisions as above, now writing the output */
+	in = out = linelen = 0;
+	while (in < datalen) {
+		if (qp_must_quote(data, in, datalen, linelen,
+				  quotetabs, istext, header)) {
+			if ((linelen + 3) >= MAXLINESIZE) {
+				odata[out++] = '=';
+				if (crlf) odata[out++] = '\r';
+				odata[out++] = '\n';
+				linelen = 0;
+			}
+			odata[out++] = '=';
+			to_hex(data[in], &odata[out]);
+			out += 2;
+			in++;
+			linelen += 3;
+		}
+		else if (istext && qp_at_line_end(data, in, datalen)) {
+			linelen = 0;
+			/* Protect against whitespace on end of line: turn
+			 * the already written blank into its =XX escape */
+			if (out && ((odata[out-1] == ' ') || (odata[out-1] == '\t'))) {
+				ch = odata[out-1];
+				odata[out-1] = '=';
+				to_hex(ch, &odata[out]);
+				out += 2;
+			}
+
+			if (crlf) odata[out++] = '\r';
+			odata[out++] = '\n';
+			in += (data[in] == '\r') ? 2 : 1;
+		}
+		else {
+			if ((in + 1 != datalen) &&
+			    (data[in+1] != '\n') &&
+			    (linelen + 1) >= MAXLINESIZE) {
+				odata[out++] = '=';
+				if (crlf) odata[out++] = '\r';
+				odata[out++] = '\n';
+				linelen = 0;
+			}
+			linelen++;
+			if (header && data[in] == ' ') {
+				odata[out++] = '_';
+				in++;
+			}
+			else {
+				odata[out++] = data[in++];
+			}
+		}
+	}
+	rv = PyString_FromStringAndSize((char *) odata, out);
+	/* The string object owns its own copy; rv may be NULL on failure */
+	free (odata);
+	return rv;
+}

 /* List of functions defined in the module */

@ -990,6 +1282,10 @@ static struct PyMethodDef binascii_module_methods[] = {
 	 doc_rledecode_hqx},
 	{"crc_hqx",    binascii_crc_hqx,    METH_VARARGS, doc_crc_hqx},
 	{"crc32",      binascii_crc32,      METH_VARARGS, doc_crc32},
+	{"a2b_qp", (PyCFunction)binascii_a2b_qp, METH_VARARGS | METH_KEYWORDS, 
+	  doc_a2b_qp},
+	{"b2a_qp", (PyCFunction)binascii_b2a_qp, METH_VARARGS | METH_KEYWORDS, 
+          doc_b2a_qp},
 	{NULL, NULL}			     /* sentinel */
 };