bpo-43613: Faster implementation of gzip.compress and gzip.decompress (GH-27941)

Co-authored-by: Łukasz Langa <lukasz@langa.pl>
2024-09-04 15:56:13 +00:00 · 2021-09-02 17:02:59 +02:00 · 2021-09-02 17:02:59 +02:00 · ea23e7820f
parent a7ef15aae8
commit ea23e7820f
8 changed files with 195 additions and 89 deletions
--- a/Doc/library/gzip.rst
+++ b/Doc/library/gzip.rst
@ -174,19 +174,30 @@ The module defines the following items:

   Compress the *data*, returning a :class:`bytes` object containing
   the compressed data.  *compresslevel* and *mtime* have the same meaning as in
-   the :class:`GzipFile` constructor above.
+   the :class:`GzipFile` constructor above. When *mtime* is set to ``0``, this
+   function is equivalent to :func:`zlib.compress` with *wbits* set to ``31``.
+   The zlib function is faster.

   .. versionadded:: 3.2
   .. versionchanged:: 3.8
      Added the *mtime* parameter for reproducible output.
+   .. versionchanged:: 3.11
+      Speed is improved by compressing all data at once instead of in a
+      streamed fashion. Calls with *mtime* set to ``0`` are delegated to
+      :func:`zlib.compress` for better speed.

 .. function:: decompress(data)

   Decompress the *data*, returning a :class:`bytes` object containing the
-   uncompressed data.
+   uncompressed data. This function is capable of decompressing multi-member
+   gzip data (multiple gzip blocks concatenated together). When the data is
+   certain to contain only one member, the :func:`zlib.decompress` function
+   with *wbits* set to ``31`` is faster.

   .. versionadded:: 3.2
-
+   .. versionchanged:: 3.11
+      Speed is improved by decompressing members at once in memory instead of in
+      a streamed fashion.

 .. _gzip-usage-examples:

--- a/Doc/library/zlib.rst
+++ b/Doc/library/zlib.rst
@ -47,7 +47,7 @@ The available exception and functions in this module are:
      platforms, use ``adler32(data) & 0xffffffff``.


-.. function:: compress(data, /, level=-1)
+.. function:: compress(data, /, level=-1, wbits=MAX_WBITS)

   Compresses the bytes in *data*, returning a bytes object containing compressed data.
   *level* is an integer from ``0`` to ``9`` or ``-1`` controlling the level of compression;
@ -55,26 +55,8 @@ The available exception and functions in this module are:
   is slowest and produces the most.  ``0`` (Z_NO_COMPRESSION) is no compression.
   The default value is ``-1`` (Z_DEFAULT_COMPRESSION).  Z_DEFAULT_COMPRESSION represents a default
   compromise between speed and compression (currently equivalent to level 6).
-   Raises the :exc:`error` exception if any error occurs.

-   .. versionchanged:: 3.6
-      *level* can now be used as a keyword parameter.
-
-
-.. function:: compressobj(level=-1, method=DEFLATED, wbits=MAX_WBITS, memLevel=DEF_MEM_LEVEL, strategy=Z_DEFAULT_STRATEGY[, zdict])
-
-   Returns a compression object, to be used for compressing data streams that won't
-   fit into memory at once.
-
-   *level* is the compression level -- an integer from ``0`` to ``9`` or ``-1``.
-   A value of ``1`` (Z_BEST_SPEED) is fastest and produces the least compression,
-   while a value of ``9`` (Z_BEST_COMPRESSION) is slowest and produces the most.
-   ``0`` (Z_NO_COMPRESSION) is no compression.  The default value is ``-1`` (Z_DEFAULT_COMPRESSION).
-   Z_DEFAULT_COMPRESSION represents a default compromise between speed and compression
-   (currently equivalent to level 6).
-
-   *method* is the compression algorithm. Currently, the only supported value is
-   :const:`DEFLATED`.
+   .. _compress-wbits:

   The *wbits* argument controls the size of the history buffer (or the
   "window size") used when compressing data, and whether a header and
@ -94,6 +76,34 @@ The available exception and functions in this module are:
     window size logarithm, while including a basic :program:`gzip` header
     and trailing checksum in the output.

+   Raises the :exc:`error` exception if any error occurs.
+
+   .. versionchanged:: 3.6
+      *level* can now be used as a keyword parameter.
+
+   .. versionchanged:: 3.11
+      The *wbits* parameter is now available to set window bits and
+      compression type.
+
+.. function:: compressobj(level=-1, method=DEFLATED, wbits=MAX_WBITS, memLevel=DEF_MEM_LEVEL, strategy=Z_DEFAULT_STRATEGY[, zdict])
+
+   Returns a compression object, to be used for compressing data streams that won't
+   fit into memory at once.
+
+   *level* is the compression level -- an integer from ``0`` to ``9`` or ``-1``.
+   A value of ``1`` (Z_BEST_SPEED) is fastest and produces the least compression,
+   while a value of ``9`` (Z_BEST_COMPRESSION) is slowest and produces the most.
+   ``0`` (Z_NO_COMPRESSION) is no compression.  The default value is ``-1`` (Z_DEFAULT_COMPRESSION).
+   Z_DEFAULT_COMPRESSION represents a default compromise between speed and compression
+   (currently equivalent to level 6).
+
+   *method* is the compression algorithm. Currently, the only supported value is
+   :const:`DEFLATED`.
+
+   The *wbits* parameter controls the size of the history buffer (or the
+   "window size"), and what header and trailer format will be used. It has
+   the same meaning as `described for compress() <#compress-wbits>`__.
+
   The *memLevel* argument controls the amount of memory used for the
   internal compression state. Valid values range from ``1`` to ``9``.
   Higher values use more memory, but are faster and produce smaller output.
--- a/Lib/gzip.py
+++ b/Lib/gzip.py
@ -403,6 +403,59 @@ def __iter__(self):
        return self._buffer.__iter__()


+def _read_exact(fp, n):
+    '''Read exactly *n* bytes from `fp`
+
+    fp may be unbuffered (i.e. return short reads), so keep reading until
+    *n* bytes are collected; raise EOFError if fp runs dry before that.
+    '''
+    data = fp.read(n)
+    while len(data) < n:
+        b = fp.read(n - len(data))
+        if not b:
+            raise EOFError("Compressed file ended before the "
+                           "end-of-stream marker was reached")
+        data += b
+    return data
+
+
+def _read_gzip_header(fp):
+    '''Read a gzip header from `fp` and advance to the end of the header.
+
+    Return the header's mtime, or None if `fp` yields no more data.
+    '''
+    magic = fp.read(2)
+    if magic == b'':
+        return None
+
+    if magic != b'\037\213':
+        raise BadGzipFile('Not a gzipped file (%r)' % magic)
+
+    (method, flag, last_mtime) = struct.unpack("<BBIxx", _read_exact(fp, 8))
+    if method != 8:
+        raise BadGzipFile('Unknown compression method')
+
+    if flag & FEXTRA:
+        # Read & discard the extra field, if present
+        extra_len, = struct.unpack("<H", _read_exact(fp, 2))
+        _read_exact(fp, extra_len)
+    if flag & FNAME:
+        # Read and discard a null-terminated string containing the filename
+        while True:
+            s = fp.read(1)
+            if not s or s==b'\000':
+                break
+    if flag & FCOMMENT:
+        # Read and discard a null-terminated string containing a comment
+        while True:
+            s = fp.read(1)
+            if not s or s==b'\000':
+                break
+    if flag & FHCRC:
+        _read_exact(fp, 2)     # Read & discard the 16-bit header CRC
+    return last_mtime
+
+
 class _GzipReader(_compression.DecompressReader):
    def __init__(self, fp):
        super().__init__(_PaddedFile(fp), zlib.decompressobj,
@ -415,53 +468,11 @@ def _init_read(self):
        self._crc = zlib.crc32(b"")
        self._stream_size = 0  # Decompressed size of unconcatenated stream

-    def _read_exact(self, n):
-        '''Read exactly *n* bytes from `self._fp`
-
-        This method is required because self._fp may be unbuffered,
-        i.e. return short reads.
-        '''
-
-        data = self._fp.read(n)
-        while len(data) < n:
-            b = self._fp.read(n - len(data))
-            if not b:
-                raise EOFError("Compressed file ended before the "
-                               "end-of-stream marker was reached")
-            data += b
-        return data
-
    def _read_gzip_header(self):
-        magic = self._fp.read(2)
-        if magic == b'':
+        last_mtime = _read_gzip_header(self._fp)
+        if last_mtime is None:
            return False
-
-        if magic != b'\037\213':
-            raise BadGzipFile('Not a gzipped file (%r)' % magic)
-
-        (method, flag,
-         self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
-        if method != 8:
-            raise BadGzipFile('Unknown compression method')
-
-        if flag & FEXTRA:
-            # Read & discard the extra field, if present
-            extra_len, = struct.unpack("<H", self._read_exact(2))
-            self._read_exact(extra_len)
-        if flag & FNAME:
-            # Read and discard a null-terminated string containing the filename
-            while True:
-                s = self._fp.read(1)
-                if not s or s==b'\000':
-                    break
-        if flag & FCOMMENT:
-            # Read and discard a null-terminated string containing a comment
-            while True:
-                s = self._fp.read(1)
-                if not s or s==b'\000':
-                    break
-        if flag & FHCRC:
-            self._read_exact(2)     # Read & discard the 16-bit header CRC
+        self._last_mtime = last_mtime
        return True

    def read(self, size=-1):
@ -524,7 +535,7 @@ def _read_eof(self):
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
-        crc32, isize = struct.unpack("<II", self._read_exact(8))
+        crc32, isize = struct.unpack("<II", _read_exact(self._fp, 8))
        if crc32 != self._crc:
            raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
                                                             hex(self._crc)))
@ -544,21 +555,65 @@ def _rewind(self):
        super()._rewind()
        self._new_member = True

+
+def _create_simple_gzip_header(compresslevel: int,
+                               mtime = None) -> bytes:
+    """
+    Create a simple gzip header with no extra fields.
+    :param compresslevel: Compression level used to determine the xfl byte.
+    :param mtime: The mtime (convertible to a 32-bit int); None means now.
+    :return: A bytes object representing the gzip header.
+    """
+    if mtime is None:
+        mtime = time.time()
+    if compresslevel == _COMPRESS_LEVEL_BEST:
+        xfl = 2
+    elif compresslevel == _COMPRESS_LEVEL_FAST:
+        xfl = 4
+    else:
+        xfl = 0
+    # Pack ID1 and ID2 magic bytes, method (8=deflate), header flags (no extra
+    # fields added to header), mtime, xfl and os (255 for unknown OS).
+    return struct.pack("<BBBBLBB", 0x1f, 0x8b, 8, 0, int(mtime), xfl, 255)
+
+
 def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
    """Compress data in one shot and return the compressed string.
-    Optional argument is the compression level, in range of 0-9.
+
+    compresslevel sets the compression level in range of 0-9.
+    mtime can be used to set the modification time. The modification time is
+    set to the current time by default.
    """
-    buf = io.BytesIO()
-    with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel, mtime=mtime) as f:
-        f.write(data)
-    return buf.getvalue()
+    if mtime == 0:
+        # zlib writes a gzip header with mtime 0 itself; faster, less overhead.
+        return zlib.compress(data, level=compresslevel, wbits=31)
+    header = _create_simple_gzip_header(compresslevel, mtime)
+    trailer = struct.pack("<LL", zlib.crc32(data), (len(data) & 0xffffffff))
+    # wbits=-15 creates a raw deflate block; the level must be passed here too.
+    deflated = zlib.compress(data, level=compresslevel, wbits=-15)
+    return header + deflated + trailer
+

 def decompress(data):
    """Decompress a gzip compressed string in one shot.
    Return the decompressed string.
    """
-    with GzipFile(fileobj=io.BytesIO(data)) as f:
-        return f.read()
+    decompressed_members = []
+    while True:
+        fp = io.BytesIO(data)
+        if _read_gzip_header(fp) is None:
+            return b"".join(decompressed_members)
+        # Raw deflate decompressor, fed everything after the gzip header.
+        do = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
+        # NOTE(review): a truncated trailer makes struct.unpack fail below.
+        decompressed = do.decompress(data[fp.tell():])
+        crc, length = struct.unpack("<II", do.unused_data[:8])
+        if crc != zlib.crc32(decompressed):
+            raise BadGzipFile("CRC check failed")
+        if length != (len(decompressed) & 0xffffffff):
+            raise BadGzipFile("Incorrect length of data produced")
+        decompressed_members.append(decompressed)
+        data = do.unused_data[8:].lstrip(b"\x00")


 def main():
--- a/Lib/test/test_zlib.py
+++ b/Lib/test/test_zlib.py
@ -831,6 +831,13 @@ def test_wbits(self):
        dco = zlib.decompressobj(32 + 15)
        self.assertEqual(dco.decompress(gzip), HAMLET_SCENE)

+        for wbits in (-15, 15, 31):
+            with self.subTest(wbits=wbits):
+                expected = HAMLET_SCENE
+                actual = zlib.decompress(
+                    zlib.compress(HAMLET_SCENE, wbits=wbits), wbits=wbits
+                )
+                self.assertEqual(expected, actual)

 def choose_lines(source, number, seed=None, generator=random):
    """Return a list of number lines randomly chosen from the source"""
--- a/Misc/NEWS.d/next/Library/2021-03-24-09-40-02.bpo-43612.vMGZ4y.rst
+++ b/Misc/NEWS.d/next/Library/2021-03-24-09-40-02.bpo-43612.vMGZ4y.rst
@ -0,0 +1,5 @@
+:func:`zlib.compress` now accepts a wbits parameter which allows users to
+compress data as a raw deflate block without zlib headers and trailers in
+one go. Previously this required instantiating a ``zlib.compressobj``. It
+also provides a faster alternative to ``gzip.compress`` when wbits=31 is
+used.
--- a/Misc/NEWS.d/next/Library/2021-08-25-10-28-49.bpo-43613.WkYmI0.rst
+++ b/Misc/NEWS.d/next/Library/2021-08-25-10-28-49.bpo-43613.WkYmI0.rst
@ -0,0 +1,3 @@
+Improve the speed of :func:`gzip.compress` and :func:`gzip.decompress` by
+compressing and decompressing at once in memory instead of in a streamed
+fashion.
--- a/Modules/clinic/zlibmodule.c.h
+++ b/Modules/clinic/zlibmodule.c.h
@ -3,7 +3,7 @@ preserve
 [clinic start generated code]*/

 PyDoc_STRVAR(zlib_compress__doc__,
-"compress($module, data, /, level=Z_DEFAULT_COMPRESSION)\n"
+"compress($module, data, /, level=Z_DEFAULT_COMPRESSION, wbits=MAX_WBITS)\n"
 "--\n"
 "\n"
 "Returns a bytes object containing compressed data.\n"
@ -11,26 +11,29 @@ PyDoc_STRVAR(zlib_compress__doc__,
 "  data\n"
 "    Binary data to be compressed.\n"
 "  level\n"
-"    Compression level, in 0-9 or -1.");
+"    Compression level, in 0-9 or -1.\n"
+"  wbits\n"
+"    The window buffer size and container format.");

 #define ZLIB_COMPRESS_METHODDEF    \
    {"compress", (PyCFunction)(void(*)(void))zlib_compress, METH_FASTCALL|METH_KEYWORDS, zlib_compress__doc__},

 static PyObject *
-zlib_compress_impl(PyObject *module, Py_buffer *data, int level);
+zlib_compress_impl(PyObject *module, Py_buffer *data, int level, int wbits);

 static PyObject *
 zlib_compress(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
 {
    PyObject *return_value = NULL;
-    static const char * const _keywords[] = {"", "level", NULL};
+    static const char * const _keywords[] = {"", "level", "wbits", NULL};
    static _PyArg_Parser _parser = {NULL, _keywords, "compress", 0};
-    PyObject *argsbuf[2];
+    PyObject *argsbuf[3];
    Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1;
    Py_buffer data = {NULL, NULL};
    int level = Z_DEFAULT_COMPRESSION;
+    int wbits = MAX_WBITS;

-    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 2, 0, argsbuf);
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 3, 0, argsbuf);
    if (!args) {
        goto exit;
    }
@ -44,12 +47,21 @@ zlib_compress(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObjec
    if (!noptargs) {
        goto skip_optional_pos;
    }
-    level = _PyLong_AsInt(args[1]);
-    if (level == -1 && PyErr_Occurred()) {
+    if (args[1]) {
+        level = _PyLong_AsInt(args[1]);
+        if (level == -1 && PyErr_Occurred()) {
+            goto exit;
+        }
+        if (!--noptargs) {
+            goto skip_optional_pos;
+        }
+    }
+    wbits = _PyLong_AsInt(args[2]);
+    if (wbits == -1 && PyErr_Occurred()) {
        goto exit;
    }
 skip_optional_pos:
-    return_value = zlib_compress_impl(module, &data, level);
+    return_value = zlib_compress_impl(module, &data, level, wbits);

 exit:
    /* Cleanup for data */
@ -803,4 +815,4 @@ exit:
 #ifndef ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
    #define ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
 #endif /* !defined(ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF) */
-/*[clinic end generated code: output=6736bae59fab268b input=a9049054013a1b77]*/
+/*[clinic end generated code: output=e3e8a6142ea045a7 input=a9049054013a1b77]*/
--- a/Modules/zlibmodule.c
+++ b/Modules/zlibmodule.c
@ -310,13 +310,15 @@ zlib.compress
    /
    level: int(c_default="Z_DEFAULT_COMPRESSION") = Z_DEFAULT_COMPRESSION
        Compression level, in 0-9 or -1.
+    wbits: int(c_default="MAX_WBITS") = MAX_WBITS
+        The window buffer size and container format.

 Returns a bytes object containing compressed data.
 [clinic start generated code]*/

 static PyObject *
-zlib_compress_impl(PyObject *module, Py_buffer *data, int level)
-/*[clinic end generated code: output=d80906d73f6294c8 input=638d54b6315dbed3]*/
+zlib_compress_impl(PyObject *module, Py_buffer *data, int level, int wbits)
+/*[clinic end generated code: output=46bd152fadd66df2 input=c4d06ee5782a7e3f]*/
 {
    PyObject *RetVal;
    int flush;
@ -336,7 +338,8 @@ zlib_compress_impl(PyObject *module, Py_buffer *data, int level)
    zst.zalloc = PyZlib_Malloc;
    zst.zfree = PyZlib_Free;
    zst.next_in = ibuf;
-    int err = deflateInit(&zst, level);
+    int err = deflateInit2(&zst, level, DEFLATED, wbits, DEF_MEM_LEVEL,
+                           Z_DEFAULT_STRATEGY);

    switch (err) {
    case Z_OK: