bpo-38256: Fix binascii.crc32() when inputs are 4+GiB (GH-32000)

When compiled with `USE_ZLIB_CRC32` defined (`configure` sets this on POSIX systems), `binascii.crc32(...)` failed to compute the correct value when the input data was >= 4GiB. Because the zlib crc32 API is limited to a 32-bit length.

This lines it up with the `zlib.crc32(...)` implementation that doesn't have that flaw.

**Performance:** This also adopts the same GIL releasing for larger inputs logic that `zlib.crc32` has, and causes the Windows build to always use zlib's crc32 instead of our slow C code as zlib is a required build dependency on Windows.
This commit is contained in:
Gregory P. Smith 2022-03-20 12:28:15 -07:00 committed by GitHub
parent 3ae975f1ac
commit 9d1c4d69db
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 87 additions and 31 deletions

View file

@ -4,7 +4,7 @@
import binascii
import array
import re
from test.support import warnings_helper
from test.support import bigmemtest, _1G, _4G, warnings_helper
# Note: "*_hex" functions are aliases for "(un)hexlify"
@ -441,6 +441,14 @@ class BytearrayBinASCIITest(BinASCIITest):
class MemoryviewBinASCIITest(BinASCIITest):
type2test = memoryview
class ChecksumBigBufferTestCase(unittest.TestCase):
"""bpo-38256 - check that inputs >=4 GiB are handled correctly."""
@bigmemtest(size=_4G + 4, memuse=1, dry_run=False)
def test_big_buffer(self, size):
data = b"nyan" * (_1G + 1)
self.assertEqual(binascii.crc32(data), 1044521549)
if __name__ == "__main__":
unittest.main()

View file

@ -0,0 +1,14 @@
Fix :func:`binascii.crc32` when it is compiled to use zlib'c crc32 to
work properly on inputs 4+GiB in length instead of returning the wrong
result. The workaround prior to this was to always feed the function
data in increments smaller than 4GiB or to just call the zlib module
function.
We also have :func:`binascii.crc32` release the GIL when computing
on larger inputs as :func:`zlib.crc32` and :mod:`hashlib` do.
This also boosts performance on Windows as it now uses the zlib crc32
implementation for :func:`binascii.crc32` for a 2-3x speedup.
That the stdlib has a crc32 API in two modules is a known historical
oddity. This moves us closer to a single implementation behind them.

View file

@ -737,6 +737,21 @@ static const unsigned int crc_32_tab[256] = {
0x5d681b02U, 0x2a6f2b94U, 0xb40bbe37U, 0xc30c8ea1U, 0x5a05df1bU,
0x2d02ef8dU
};
static unsigned int
internal_crc32(const unsigned char *bin_data, Py_ssize_t len, unsigned int crc)
{ /* By Jim Ahlstrom; All rights transferred to CNRI */
unsigned int result;
crc = ~ crc;
while (len-- > 0) {
crc = crc_32_tab[(crc ^ *bin_data++) & 0xff] ^ (crc >> 8);
/* Note: (crc >> 8) MUST zero fill on left */
}
result = (crc ^ 0xFFFFFFFF);
return result & 0xffffffff;
}
#endif /* USE_ZLIB_CRC32 */
/*[clinic input]
@ -754,34 +769,46 @@ binascii_crc32_impl(PyObject *module, Py_buffer *data, unsigned int crc)
/*[clinic end generated code: output=52cf59056a78593b input=bbe340bc99d25aa8]*/
#ifdef USE_ZLIB_CRC32
/* This was taken from zlibmodule.c PyZlib_crc32 (but is PY_SSIZE_T_CLEAN) */
/* This is the same as zlibmodule.c zlib_crc32_impl. It exists in two
* modules for historical reasons. */
{
const Byte *buf;
Py_ssize_t len;
int signed_val;
/* Releasing the GIL for very small buffers is inefficient
and may lower performance */
if (data->len > 1024*5) {
unsigned char *buf = data->buf;
Py_ssize_t len = data->len;
buf = (Byte*)data->buf;
len = data->len;
signed_val = crc32(crc, buf, len);
return (unsigned int)signed_val & 0xffffffffU;
Py_BEGIN_ALLOW_THREADS
/* Avoid truncation of length for very large buffers. crc32() takes
length as an unsigned int, which may be narrower than Py_ssize_t. */
while ((size_t)len > UINT_MAX) {
crc = crc32(crc, buf, UINT_MAX);
buf += (size_t) UINT_MAX;
len -= (size_t) UINT_MAX;
}
crc = crc32(crc, buf, (unsigned int)len);
Py_END_ALLOW_THREADS
} else {
crc = crc32(crc, data->buf, (unsigned int)data->len);
}
return crc & 0xffffffff;
}
#else /* USE_ZLIB_CRC32 */
{ /* By Jim Ahlstrom; All rights transferred to CNRI */
const unsigned char *bin_data;
Py_ssize_t len;
unsigned int result;
{
const unsigned char *bin_data = data->buf;
Py_ssize_t len = data->len;
bin_data = data->buf;
len = data->len;
crc = ~ crc;
while (len-- > 0) {
crc = crc_32_tab[(crc ^ *bin_data++) & 0xff] ^ (crc >> 8);
/* Note: (crc >> 8) MUST zero fill on left */
/* Releasing the GIL for very small buffers is inefficient
and may lower performance */
if (len > 1024*5) {
unsigned int result;
Py_BEGIN_ALLOW_THREADS
result = internal_crc32(bin_data, len, crc);
Py_END_ALLOW_THREADS
return result;
} else {
return internal_crc32(bin_data, len, crc);
}
result = (crc ^ 0xFFFFFFFF);
return result & 0xffffffff;
}
#endif /* USE_ZLIB_CRC32 */

View file

@ -753,7 +753,7 @@ PyDoc_STRVAR(zlib_crc32__doc__,
#define ZLIB_CRC32_METHODDEF \
{"crc32", (PyCFunction)(void(*)(void))zlib_crc32, METH_FASTCALL, zlib_crc32__doc__},
static PyObject *
static unsigned int
zlib_crc32_impl(PyObject *module, Py_buffer *data, unsigned int value);
static PyObject *
@ -762,6 +762,7 @@ zlib_crc32(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
PyObject *return_value = NULL;
Py_buffer data = {NULL, NULL};
unsigned int value = 0;
unsigned int _return_value;
if (!_PyArg_CheckPositional("crc32", nargs, 1, 2)) {
goto exit;
@ -781,7 +782,11 @@ zlib_crc32(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
goto exit;
}
skip_optional:
return_value = zlib_crc32_impl(module, &data, value);
_return_value = zlib_crc32_impl(module, &data, value);
if ((_return_value == (unsigned int)-1) && PyErr_Occurred()) {
goto exit;
}
return_value = PyLong_FromUnsignedLong((unsigned long)_return_value);
exit:
/* Cleanup for data */
@ -815,4 +820,4 @@ exit:
#ifndef ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
#define ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
#endif /* !defined(ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF) */
/*[clinic end generated code: output=e3e8a6142ea045a7 input=a9049054013a1b77]*/
/*[clinic end generated code: output=f009d194565416d1 input=a9049054013a1b77]*/

View file

@ -1420,7 +1420,7 @@ zlib_adler32_impl(PyObject *module, Py_buffer *data, unsigned int value)
}
/*[clinic input]
zlib.crc32
zlib.crc32 -> unsigned_int
data: Py_buffer
value: unsigned_int(bitwise=True) = 0
@ -1432,9 +1432,9 @@ Compute a CRC-32 checksum of data.
The returned checksum is an integer.
[clinic start generated code]*/
static PyObject *
static unsigned int
zlib_crc32_impl(PyObject *module, Py_buffer *data, unsigned int value)
/*[clinic end generated code: output=63499fa20af7ea25 input=26c3ed430fa00b4c]*/
/*[clinic end generated code: output=b217562e4fe6d6a6 input=1229cb2fb5ea948a]*/
{
/* Releasing the GIL for very small buffers is inefficient
and may lower performance */
@ -1455,7 +1455,7 @@ zlib_crc32_impl(PyObject *module, Py_buffer *data, unsigned int value)
} else {
value = crc32(value, data->buf, (unsigned int)data->len);
}
return PyLong_FromUnsignedLong(value & 0xffffffffU);
return value;
}

View file

@ -366,7 +366,9 @@
<ClCompile Include="..\Modules\arraymodule.c" />
<ClCompile Include="..\Modules\atexitmodule.c" />
<ClCompile Include="..\Modules\audioop.c" />
<ClCompile Include="..\Modules\binascii.c" />
<ClCompile Include="..\Modules\binascii.c">
<PreprocessorDefinitions>USE_ZLIB_CRC32;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<ClCompile Include="..\Modules\cmathmodule.c" />
<ClCompile Include="..\Modules\_datetimemodule.c" />
<ClCompile Include="..\Modules\errnomodule.c" />