mirror of
https://github.com/python/cpython
synced 2024-09-04 15:56:13 +00:00
bpo-43613: Faster implementation of gzip.compress and gzip.decompress (GH-27941)
Co-authored-by: Łukasz Langa <lukasz@langa.pl>
This commit is contained in:
parent
a7ef15aae8
commit
ea23e7820f
|
@ -174,19 +174,30 @@ The module defines the following items:
|
|||
|
||||
Compress the *data*, returning a :class:`bytes` object containing
|
||||
the compressed data. *compresslevel* and *mtime* have the same meaning as in
|
||||
the :class:`GzipFile` constructor above.
|
||||
the :class:`GzipFile` constructor above. When *mtime* is set to ``0``, this
|
||||
function is equivalent to :func:`zlib.compress` with *wbits* set to ``31``.
|
||||
The zlib function is faster.
|
||||
|
||||
.. versionadded:: 3.2
|
||||
.. versionchanged:: 3.8
|
||||
Added the *mtime* parameter for reproducible output.
|
||||
.. versionchanged:: 3.11
|
||||
Speed is improved by compressing all data at once instead of in a
|
||||
streamed fashion. Calls with *mtime* set to ``0`` are delegated to
|
||||
:func:`zlib.compress` for better speed.
|
||||
|
||||
.. function:: decompress(data)
|
||||
|
||||
Decompress the *data*, returning a :class:`bytes` object containing the
|
||||
uncompressed data.
|
||||
uncompressed data. This function is capable of decompressing multi-member
|
||||
gzip data (multiple gzip blocks concatenated together). When the data is
|
||||
certain to contain only one member, the :func:`zlib.decompress` function with
|
||||
*wbits* set to ``31`` is faster.
|
||||
|
||||
.. versionadded:: 3.2
|
||||
|
||||
.. versionchanged:: 3.11
|
||||
Speed is improved by decompressing members at once in memory instead of in
|
||||
a streamed fashion.
|
||||
|
||||
.. _gzip-usage-examples:
|
||||
|
||||
|
|
|
@ -47,7 +47,7 @@ The available exception and functions in this module are:
|
|||
platforms, use ``adler32(data) & 0xffffffff``.
|
||||
|
||||
|
||||
.. function:: compress(data, /, level=-1)
|
||||
.. function:: compress(data, /, level=-1, wbits=MAX_WBITS)
|
||||
|
||||
Compresses the bytes in *data*, returning a bytes object containing compressed data.
|
||||
*level* is an integer from ``0`` to ``9`` or ``-1`` controlling the level of compression;
|
||||
|
@ -55,26 +55,8 @@ The available exception and functions in this module are:
|
|||
is slowest and produces the most. ``0`` (Z_NO_COMPRESSION) is no compression.
|
||||
The default value is ``-1`` (Z_DEFAULT_COMPRESSION). Z_DEFAULT_COMPRESSION represents a default
|
||||
compromise between speed and compression (currently equivalent to level 6).
|
||||
Raises the :exc:`error` exception if any error occurs.
|
||||
|
||||
.. versionchanged:: 3.6
|
||||
*level* can now be used as a keyword parameter.
|
||||
|
||||
|
||||
.. function:: compressobj(level=-1, method=DEFLATED, wbits=MAX_WBITS, memLevel=DEF_MEM_LEVEL, strategy=Z_DEFAULT_STRATEGY[, zdict])
|
||||
|
||||
Returns a compression object, to be used for compressing data streams that won't
|
||||
fit into memory at once.
|
||||
|
||||
*level* is the compression level -- an integer from ``0`` to ``9`` or ``-1``.
|
||||
A value of ``1`` (Z_BEST_SPEED) is fastest and produces the least compression,
|
||||
while a value of ``9`` (Z_BEST_COMPRESSION) is slowest and produces the most.
|
||||
``0`` (Z_NO_COMPRESSION) is no compression. The default value is ``-1`` (Z_DEFAULT_COMPRESSION).
|
||||
Z_DEFAULT_COMPRESSION represents a default compromise between speed and compression
|
||||
(currently equivalent to level 6).
|
||||
|
||||
*method* is the compression algorithm. Currently, the only supported value is
|
||||
:const:`DEFLATED`.
|
||||
.. _compress-wbits:
|
||||
|
||||
The *wbits* argument controls the size of the history buffer (or the
|
||||
"window size") used when compressing data, and whether a header and
|
||||
|
@ -94,6 +76,34 @@ The available exception and functions in this module are:
|
|||
window size logarithm, while including a basic :program:`gzip` header
|
||||
and trailing checksum in the output.
|
||||
|
||||
Raises the :exc:`error` exception if any error occurs.
|
||||
|
||||
.. versionchanged:: 3.6
|
||||
*level* can now be used as a keyword parameter.
|
||||
|
||||
.. versionchanged:: 3.11
|
||||
The *wbits* parameter is now available to set window bits and
|
||||
compression type.
|
||||
|
||||
.. function:: compressobj(level=-1, method=DEFLATED, wbits=MAX_WBITS, memLevel=DEF_MEM_LEVEL, strategy=Z_DEFAULT_STRATEGY[, zdict])
|
||||
|
||||
Returns a compression object, to be used for compressing data streams that won't
|
||||
fit into memory at once.
|
||||
|
||||
*level* is the compression level -- an integer from ``0`` to ``9`` or ``-1``.
|
||||
A value of ``1`` (Z_BEST_SPEED) is fastest and produces the least compression,
|
||||
while a value of ``9`` (Z_BEST_COMPRESSION) is slowest and produces the most.
|
||||
``0`` (Z_NO_COMPRESSION) is no compression. The default value is ``-1`` (Z_DEFAULT_COMPRESSION).
|
||||
Z_DEFAULT_COMPRESSION represents a default compromise between speed and compression
|
||||
(currently equivalent to level 6).
|
||||
|
||||
*method* is the compression algorithm. Currently, the only supported value is
|
||||
:const:`DEFLATED`.
|
||||
|
||||
The *wbits* parameter controls the size of the history buffer (or the
|
||||
"window size"), and what header and trailer format will be used. It has
|
||||
the same meaning as `described for compress() <#compress-wbits>`__.
|
||||
|
||||
The *memLevel* argument controls the amount of memory used for the
|
||||
internal compression state. Valid values range from ``1`` to ``9``.
|
||||
Higher values use more memory, but are faster and produce smaller output.
|
||||
|
|
161
Lib/gzip.py
161
Lib/gzip.py
|
@ -403,6 +403,59 @@ def __iter__(self):
|
|||
return self._buffer.__iter__()
|
||||
|
||||
|
||||
def _read_exact(fp, n):
|
||||
'''Read exactly *n* bytes from `fp`
|
||||
|
||||
This method is required because fp may be unbuffered,
|
||||
i.e. return short reads.
|
||||
'''
|
||||
data = fp.read(n)
|
||||
while len(data) < n:
|
||||
b = fp.read(n - len(data))
|
||||
if not b:
|
||||
raise EOFError("Compressed file ended before the "
|
||||
"end-of-stream marker was reached")
|
||||
data += b
|
||||
return data
|
||||
|
||||
|
||||
def _read_gzip_header(fp):
|
||||
'''Read a gzip header from `fp` and progress to the end of the header.
|
||||
|
||||
Returns last mtime if header was present or None otherwise.
|
||||
'''
|
||||
magic = fp.read(2)
|
||||
if magic == b'':
|
||||
return None
|
||||
|
||||
if magic != b'\037\213':
|
||||
raise BadGzipFile('Not a gzipped file (%r)' % magic)
|
||||
|
||||
(method, flag, last_mtime) = struct.unpack("<BBIxx", _read_exact(fp, 8))
|
||||
if method != 8:
|
||||
raise BadGzipFile('Unknown compression method')
|
||||
|
||||
if flag & FEXTRA:
|
||||
# Read & discard the extra field, if present
|
||||
extra_len, = struct.unpack("<H", _read_exact(fp, 2))
|
||||
_read_exact(fp, extra_len)
|
||||
if flag & FNAME:
|
||||
# Read and discard a null-terminated string containing the filename
|
||||
while True:
|
||||
s = fp.read(1)
|
||||
if not s or s==b'\000':
|
||||
break
|
||||
if flag & FCOMMENT:
|
||||
# Read and discard a null-terminated string containing a comment
|
||||
while True:
|
||||
s = fp.read(1)
|
||||
if not s or s==b'\000':
|
||||
break
|
||||
if flag & FHCRC:
|
||||
_read_exact(fp, 2) # Read & discard the 16-bit header CRC
|
||||
return last_mtime
|
||||
|
||||
|
||||
class _GzipReader(_compression.DecompressReader):
|
||||
def __init__(self, fp):
|
||||
super().__init__(_PaddedFile(fp), zlib.decompressobj,
|
||||
|
@ -415,53 +468,11 @@ def _init_read(self):
|
|||
self._crc = zlib.crc32(b"")
|
||||
self._stream_size = 0 # Decompressed size of unconcatenated stream
|
||||
|
||||
def _read_exact(self, n):
|
||||
'''Read exactly *n* bytes from `self._fp`
|
||||
|
||||
This method is required because self._fp may be unbuffered,
|
||||
i.e. return short reads.
|
||||
'''
|
||||
|
||||
data = self._fp.read(n)
|
||||
while len(data) < n:
|
||||
b = self._fp.read(n - len(data))
|
||||
if not b:
|
||||
raise EOFError("Compressed file ended before the "
|
||||
"end-of-stream marker was reached")
|
||||
data += b
|
||||
return data
|
||||
|
||||
def _read_gzip_header(self):
|
||||
magic = self._fp.read(2)
|
||||
if magic == b'':
|
||||
last_mtime = _read_gzip_header(self._fp)
|
||||
if last_mtime is None:
|
||||
return False
|
||||
|
||||
if magic != b'\037\213':
|
||||
raise BadGzipFile('Not a gzipped file (%r)' % magic)
|
||||
|
||||
(method, flag,
|
||||
self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
|
||||
if method != 8:
|
||||
raise BadGzipFile('Unknown compression method')
|
||||
|
||||
if flag & FEXTRA:
|
||||
# Read & discard the extra field, if present
|
||||
extra_len, = struct.unpack("<H", self._read_exact(2))
|
||||
self._read_exact(extra_len)
|
||||
if flag & FNAME:
|
||||
# Read and discard a null-terminated string containing the filename
|
||||
while True:
|
||||
s = self._fp.read(1)
|
||||
if not s or s==b'\000':
|
||||
break
|
||||
if flag & FCOMMENT:
|
||||
# Read and discard a null-terminated string containing a comment
|
||||
while True:
|
||||
s = self._fp.read(1)
|
||||
if not s or s==b'\000':
|
||||
break
|
||||
if flag & FHCRC:
|
||||
self._read_exact(2) # Read & discard the 16-bit header CRC
|
||||
self._last_mtime = last_mtime
|
||||
return True
|
||||
|
||||
def read(self, size=-1):
|
||||
|
@ -524,7 +535,7 @@ def _read_eof(self):
|
|||
# We check that the computed CRC and size of the
|
||||
# uncompressed data matches the stored values. Note that the size
|
||||
# stored is the true file size mod 2**32.
|
||||
crc32, isize = struct.unpack("<II", self._read_exact(8))
|
||||
crc32, isize = struct.unpack("<II", _read_exact(self._fp, 8))
|
||||
if crc32 != self._crc:
|
||||
raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
|
||||
hex(self._crc)))
|
||||
|
@ -544,21 +555,65 @@ def _rewind(self):
|
|||
super()._rewind()
|
||||
self._new_member = True
|
||||
|
||||
|
||||
def _create_simple_gzip_header(compresslevel: int,
                               mtime=None) -> bytes:
    """
    Build a 10-byte gzip header that carries no optional fields.

    :param compresslevel: Compression level; only used to pick the XFL byte.
    :param mtime: Modification time (must support conversion to a 32-bit
                  integer). ``None`` means the current time.
    :return: A bytes object representing the gzip header.
    """
    timestamp = time.time() if mtime is None else mtime

    # XFL: 2 for the best-compression level, 4 for the fastest, else 0.
    xfl = 0
    if compresslevel == _COMPRESS_LEVEL_BEST:
        xfl = 2
    elif compresslevel == _COMPRESS_LEVEL_FAST:
        xfl = 4

    # ID1, ID2, method (8 = deflate) and an all-zero flags byte are constant;
    # they are followed by mtime, xfl and the OS byte (255 for unknown OS).
    return b"\x1f\x8b\x08\x00" + struct.pack("<LBB", int(timestamp), xfl, 255)
|
||||
|
||||
|
||||
def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
    """Compress data in one shot and return the compressed string.

    compresslevel sets the compression level in range of 0-9.
    mtime can be used to set the modification time. The modification time is
    set to the current time by default.
    """
    if mtime == 0:
        # Use zlib as it creates the header with 0 mtime by default.
        # This is faster and with less overhead.
        return zlib.compress(data, level=compresslevel, wbits=31)
    header = _create_simple_gzip_header(compresslevel, mtime)
    # The trailer's size field counts *bytes*.  len() would report the number
    # of items for multi-byte buffers (e.g. array('I')), so ask the buffer
    # itself for its byte size.
    with memoryview(data) as view:
        size = view.nbytes
    trailer = struct.pack("<LL", zlib.crc32(data), size & 0xffffffff)
    # Wbits=-15 creates a raw deflate block.  The level must be passed
    # explicitly, otherwise zlib falls back to its default level and the
    # requested compresslevel only shows up in the header's xfl byte.
    return (header
            + zlib.compress(data, level=compresslevel, wbits=-15)
            + trailer)
|
||||
|
||||
|
||||
def decompress(data):
    """Decompress a gzip compressed string in one shot.
    Return the decompressed string.

    Multi-member input (several gzip members concatenated, optionally
    separated by zero padding) is handled; the members' payloads are
    joined in order.

    Raises EOFError if a member is truncated and BadGzipFile if a member's
    CRC or length check fails.
    """
    decompressed_members = []
    while True:
        fp = io.BytesIO(data)
        if _read_gzip_header(fp) is None:
            # No more data: every member has been consumed.
            return b"".join(decompressed_members)
        # Use a zlib raw deflate compressor
        do = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
        # Read all the data except the header
        decompressed = do.decompress(data[fp.tell():])
        # A complete member ends its deflate stream and is followed by an
        # 8-byte trailer.  Without this check a truncated member would either
        # surface as struct.error or be validated against garbage.
        if not do.eof or len(do.unused_data) < 8:
            raise EOFError("Compressed file ended before the end-of-stream "
                           "marker was reached")
        crc, length = struct.unpack("<II", do.unused_data[:8])
        if crc != zlib.crc32(decompressed):
            raise BadGzipFile("CRC check failed")
        # The stored size is the true size mod 2**32.
        if length != (len(decompressed) & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")
        decompressed_members.append(decompressed)
        # Skip the trailer and any zero padding before the next member.
        data = do.unused_data[8:].lstrip(b"\x00")
|
||||
|
||||
|
||||
def main():
|
||||
|
|
|
@ -831,6 +831,13 @@ def test_wbits(self):
|
|||
dco = zlib.decompressobj(32 + 15)
|
||||
self.assertEqual(dco.decompress(gzip), HAMLET_SCENE)
|
||||
|
||||
for wbits in (-15, 15, 31):
|
||||
with self.subTest(wbits=wbits):
|
||||
expected = HAMLET_SCENE
|
||||
actual = zlib.decompress(
|
||||
zlib.compress(HAMLET_SCENE, wbits=wbits), wbits=wbits
|
||||
)
|
||||
self.assertEqual(expected, actual)
|
||||
|
||||
def choose_lines(source, number, seed=None, generator=random):
|
||||
"""Return a list of number lines randomly chosen from the source"""
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
:func:`zlib.compress` now accepts a wbits parameter which allows users to
|
||||
compress data as a raw deflate block without zlib headers and trailers in
|
||||
one go. Previously this required instantiating a ``zlib.compressobj``. It
|
||||
also provides a faster alternative to ``gzip.compress`` when wbits=31 is
|
||||
used.
|
|
@ -0,0 +1,3 @@
|
|||
Improve the speed of :func:`gzip.compress` and :func:`gzip.decompress` by
|
||||
compressing and decompressing at once in memory instead of in a streamed
|
||||
fashion.
|
32
Modules/clinic/zlibmodule.c.h
generated
32
Modules/clinic/zlibmodule.c.h
generated
|
@ -3,7 +3,7 @@ preserve
|
|||
[clinic start generated code]*/
|
||||
|
||||
PyDoc_STRVAR(zlib_compress__doc__,
|
||||
"compress($module, data, /, level=Z_DEFAULT_COMPRESSION)\n"
|
||||
"compress($module, data, /, level=Z_DEFAULT_COMPRESSION, wbits=MAX_WBITS)\n"
|
||||
"--\n"
|
||||
"\n"
|
||||
"Returns a bytes object containing compressed data.\n"
|
||||
|
@ -11,26 +11,29 @@ PyDoc_STRVAR(zlib_compress__doc__,
|
|||
" data\n"
|
||||
" Binary data to be compressed.\n"
|
||||
" level\n"
|
||||
" Compression level, in 0-9 or -1.");
|
||||
" Compression level, in 0-9 or -1.\n"
|
||||
" wbits\n"
|
||||
" The window buffer size and container format.");
|
||||
|
||||
#define ZLIB_COMPRESS_METHODDEF \
|
||||
{"compress", (PyCFunction)(void(*)(void))zlib_compress, METH_FASTCALL|METH_KEYWORDS, zlib_compress__doc__},
|
||||
|
||||
static PyObject *
|
||||
zlib_compress_impl(PyObject *module, Py_buffer *data, int level);
|
||||
zlib_compress_impl(PyObject *module, Py_buffer *data, int level, int wbits);
|
||||
|
||||
static PyObject *
|
||||
zlib_compress(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
|
||||
{
|
||||
PyObject *return_value = NULL;
|
||||
static const char * const _keywords[] = {"", "level", NULL};
|
||||
static const char * const _keywords[] = {"", "level", "wbits", NULL};
|
||||
static _PyArg_Parser _parser = {NULL, _keywords, "compress", 0};
|
||||
PyObject *argsbuf[2];
|
||||
PyObject *argsbuf[3];
|
||||
Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1;
|
||||
Py_buffer data = {NULL, NULL};
|
||||
int level = Z_DEFAULT_COMPRESSION;
|
||||
int wbits = MAX_WBITS;
|
||||
|
||||
args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 2, 0, argsbuf);
|
||||
args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 3, 0, argsbuf);
|
||||
if (!args) {
|
||||
goto exit;
|
||||
}
|
||||
|
@ -44,12 +47,21 @@ zlib_compress(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObjec
|
|||
if (!noptargs) {
|
||||
goto skip_optional_pos;
|
||||
}
|
||||
level = _PyLong_AsInt(args[1]);
|
||||
if (level == -1 && PyErr_Occurred()) {
|
||||
if (args[1]) {
|
||||
level = _PyLong_AsInt(args[1]);
|
||||
if (level == -1 && PyErr_Occurred()) {
|
||||
goto exit;
|
||||
}
|
||||
if (!--noptargs) {
|
||||
goto skip_optional_pos;
|
||||
}
|
||||
}
|
||||
wbits = _PyLong_AsInt(args[2]);
|
||||
if (wbits == -1 && PyErr_Occurred()) {
|
||||
goto exit;
|
||||
}
|
||||
skip_optional_pos:
|
||||
return_value = zlib_compress_impl(module, &data, level);
|
||||
return_value = zlib_compress_impl(module, &data, level, wbits);
|
||||
|
||||
exit:
|
||||
/* Cleanup for data */
|
||||
|
@ -803,4 +815,4 @@ exit:
|
|||
#ifndef ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
|
||||
#define ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
|
||||
#endif /* !defined(ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF) */
|
||||
/*[clinic end generated code: output=6736bae59fab268b input=a9049054013a1b77]*/
|
||||
/*[clinic end generated code: output=e3e8a6142ea045a7 input=a9049054013a1b77]*/
|
||||
|
|
|
@ -310,13 +310,15 @@ zlib.compress
|
|||
/
|
||||
level: int(c_default="Z_DEFAULT_COMPRESSION") = Z_DEFAULT_COMPRESSION
|
||||
Compression level, in 0-9 or -1.
|
||||
wbits: int(c_default="MAX_WBITS") = MAX_WBITS
|
||||
The window buffer size and container format.
|
||||
|
||||
Returns a bytes object containing compressed data.
|
||||
[clinic start generated code]*/
|
||||
|
||||
static PyObject *
|
||||
zlib_compress_impl(PyObject *module, Py_buffer *data, int level)
|
||||
/*[clinic end generated code: output=d80906d73f6294c8 input=638d54b6315dbed3]*/
|
||||
zlib_compress_impl(PyObject *module, Py_buffer *data, int level, int wbits)
|
||||
/*[clinic end generated code: output=46bd152fadd66df2 input=c4d06ee5782a7e3f]*/
|
||||
{
|
||||
PyObject *RetVal;
|
||||
int flush;
|
||||
|
@ -336,7 +338,8 @@ zlib_compress_impl(PyObject *module, Py_buffer *data, int level)
|
|||
zst.zalloc = PyZlib_Malloc;
|
||||
zst.zfree = PyZlib_Free;
|
||||
zst.next_in = ibuf;
|
||||
int err = deflateInit(&zst, level);
|
||||
int err = deflateInit2(&zst, level, DEFLATED, wbits, DEF_MEM_LEVEL,
|
||||
Z_DEFAULT_STRATEGY);
|
||||
|
||||
switch (err) {
|
||||
case Z_OK:
|
||||
|
|
Loading…
Reference in a new issue