bpo-38256: Fix binascii.crc32() when inputs are 4+GiB (GH-32000)

When compiled with `USE_ZLIB_CRC32` defined (`configure` sets this on POSIX systems), `binascii.crc32(...)` failed to compute the correct value when the input data was >= 4GiB. Because the zlib crc32 API is limited to a 32-bit length.

This lines it up with the `zlib.crc32(...)` implementation that doesn't have that flaw.

**Performance:** This also adopts the same GIL releasing for larger inputs logic that `zlib.crc32` has, and causes the Windows build to always use zlib's crc32 instead of our slow C code as zlib is a required build dependency on Windows.
This commit is contained in:
Gregory P. Smith 2022-03-20 12:28:15 -07:00 committed by GitHub
parent 3ae975f1ac
commit 9d1c4d69db
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 87 additions and 31 deletions

View file

@ -4,7 +4,7 @@
import binascii import binascii
import array import array
import re import re
from test.support import warnings_helper from test.support import bigmemtest, _1G, _4G, warnings_helper
# Note: "*_hex" functions are aliases for "(un)hexlify" # Note: "*_hex" functions are aliases for "(un)hexlify"
@ -441,6 +441,14 @@ class BytearrayBinASCIITest(BinASCIITest):
class MemoryviewBinASCIITest(BinASCIITest): class MemoryviewBinASCIITest(BinASCIITest):
type2test = memoryview type2test = memoryview
class ChecksumBigBufferTestCase(unittest.TestCase):
"""bpo-38256 - check that inputs >=4 GiB are handled correctly."""
@bigmemtest(size=_4G + 4, memuse=1, dry_run=False)
def test_big_buffer(self, size):
data = b"nyan" * (_1G + 1)
self.assertEqual(binascii.crc32(data), 1044521549)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()

View file

@ -0,0 +1,14 @@
Fix :func:`binascii.crc32` when it is compiled to use zlib'c crc32 to
work properly on inputs 4+GiB in length instead of returning the wrong
result. The workaround prior to this was to always feed the function
data in increments smaller than 4GiB or to just call the zlib module
function.
We also have :func:`binascii.crc32` release the GIL when computing
on larger inputs as :func:`zlib.crc32` and :mod:`hashlib` do.
This also boosts performance on Windows as it now uses the zlib crc32
implementation for :func:`binascii.crc32` for a 2-3x speedup.
That the stdlib has a crc32 API in two modules is a known historical
oddity. This moves us closer to a single implementation behind them.

View file

@ -737,6 +737,21 @@ static const unsigned int crc_32_tab[256] = {
0x5d681b02U, 0x2a6f2b94U, 0xb40bbe37U, 0xc30c8ea1U, 0x5a05df1bU, 0x5d681b02U, 0x2a6f2b94U, 0xb40bbe37U, 0xc30c8ea1U, 0x5a05df1bU,
0x2d02ef8dU 0x2d02ef8dU
}; };
static unsigned int
internal_crc32(const unsigned char *bin_data, Py_ssize_t len, unsigned int crc)
{ /* By Jim Ahlstrom; All rights transferred to CNRI */
unsigned int result;
crc = ~ crc;
while (len-- > 0) {
crc = crc_32_tab[(crc ^ *bin_data++) & 0xff] ^ (crc >> 8);
/* Note: (crc >> 8) MUST zero fill on left */
}
result = (crc ^ 0xFFFFFFFF);
return result & 0xffffffff;
}
#endif /* USE_ZLIB_CRC32 */ #endif /* USE_ZLIB_CRC32 */
/*[clinic input] /*[clinic input]
@ -754,34 +769,46 @@ binascii_crc32_impl(PyObject *module, Py_buffer *data, unsigned int crc)
/*[clinic end generated code: output=52cf59056a78593b input=bbe340bc99d25aa8]*/ /*[clinic end generated code: output=52cf59056a78593b input=bbe340bc99d25aa8]*/
#ifdef USE_ZLIB_CRC32 #ifdef USE_ZLIB_CRC32
/* This was taken from zlibmodule.c PyZlib_crc32 (but is PY_SSIZE_T_CLEAN) */ /* This is the same as zlibmodule.c zlib_crc32_impl. It exists in two
* modules for historical reasons. */
{ {
const Byte *buf; /* Releasing the GIL for very small buffers is inefficient
Py_ssize_t len; and may lower performance */
int signed_val; if (data->len > 1024*5) {
unsigned char *buf = data->buf;
Py_ssize_t len = data->len;
buf = (Byte*)data->buf; Py_BEGIN_ALLOW_THREADS
len = data->len; /* Avoid truncation of length for very large buffers. crc32() takes
signed_val = crc32(crc, buf, len); length as an unsigned int, which may be narrower than Py_ssize_t. */
return (unsigned int)signed_val & 0xffffffffU; while ((size_t)len > UINT_MAX) {
crc = crc32(crc, buf, UINT_MAX);
buf += (size_t) UINT_MAX;
len -= (size_t) UINT_MAX;
}
crc = crc32(crc, buf, (unsigned int)len);
Py_END_ALLOW_THREADS
} else {
crc = crc32(crc, data->buf, (unsigned int)data->len);
}
return crc & 0xffffffff;
} }
#else /* USE_ZLIB_CRC32 */ #else /* USE_ZLIB_CRC32 */
{ /* By Jim Ahlstrom; All rights transferred to CNRI */ {
const unsigned char *bin_data; const unsigned char *bin_data = data->buf;
Py_ssize_t len; Py_ssize_t len = data->len;
unsigned int result;
bin_data = data->buf; /* Releasing the GIL for very small buffers is inefficient
len = data->len; and may lower performance */
if (len > 1024*5) {
crc = ~ crc; unsigned int result;
while (len-- > 0) { Py_BEGIN_ALLOW_THREADS
crc = crc_32_tab[(crc ^ *bin_data++) & 0xff] ^ (crc >> 8); result = internal_crc32(bin_data, len, crc);
/* Note: (crc >> 8) MUST zero fill on left */ Py_END_ALLOW_THREADS
return result;
} else {
return internal_crc32(bin_data, len, crc);
} }
result = (crc ^ 0xFFFFFFFF);
return result & 0xffffffff;
} }
#endif /* USE_ZLIB_CRC32 */ #endif /* USE_ZLIB_CRC32 */

View file

@ -753,7 +753,7 @@ PyDoc_STRVAR(zlib_crc32__doc__,
#define ZLIB_CRC32_METHODDEF \ #define ZLIB_CRC32_METHODDEF \
{"crc32", (PyCFunction)(void(*)(void))zlib_crc32, METH_FASTCALL, zlib_crc32__doc__}, {"crc32", (PyCFunction)(void(*)(void))zlib_crc32, METH_FASTCALL, zlib_crc32__doc__},
static PyObject * static unsigned int
zlib_crc32_impl(PyObject *module, Py_buffer *data, unsigned int value); zlib_crc32_impl(PyObject *module, Py_buffer *data, unsigned int value);
static PyObject * static PyObject *
@ -762,6 +762,7 @@ zlib_crc32(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
PyObject *return_value = NULL; PyObject *return_value = NULL;
Py_buffer data = {NULL, NULL}; Py_buffer data = {NULL, NULL};
unsigned int value = 0; unsigned int value = 0;
unsigned int _return_value;
if (!_PyArg_CheckPositional("crc32", nargs, 1, 2)) { if (!_PyArg_CheckPositional("crc32", nargs, 1, 2)) {
goto exit; goto exit;
@ -781,7 +782,11 @@ zlib_crc32(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
goto exit; goto exit;
} }
skip_optional: skip_optional:
return_value = zlib_crc32_impl(module, &data, value); _return_value = zlib_crc32_impl(module, &data, value);
if ((_return_value == (unsigned int)-1) && PyErr_Occurred()) {
goto exit;
}
return_value = PyLong_FromUnsignedLong((unsigned long)_return_value);
exit: exit:
/* Cleanup for data */ /* Cleanup for data */
@ -815,4 +820,4 @@ exit:
#ifndef ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF #ifndef ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
#define ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF #define ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
#endif /* !defined(ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF) */ #endif /* !defined(ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF) */
/*[clinic end generated code: output=e3e8a6142ea045a7 input=a9049054013a1b77]*/ /*[clinic end generated code: output=f009d194565416d1 input=a9049054013a1b77]*/

View file

@ -1420,7 +1420,7 @@ zlib_adler32_impl(PyObject *module, Py_buffer *data, unsigned int value)
} }
/*[clinic input] /*[clinic input]
zlib.crc32 zlib.crc32 -> unsigned_int
data: Py_buffer data: Py_buffer
value: unsigned_int(bitwise=True) = 0 value: unsigned_int(bitwise=True) = 0
@ -1432,9 +1432,9 @@ Compute a CRC-32 checksum of data.
The returned checksum is an integer. The returned checksum is an integer.
[clinic start generated code]*/ [clinic start generated code]*/
static PyObject * static unsigned int
zlib_crc32_impl(PyObject *module, Py_buffer *data, unsigned int value) zlib_crc32_impl(PyObject *module, Py_buffer *data, unsigned int value)
/*[clinic end generated code: output=63499fa20af7ea25 input=26c3ed430fa00b4c]*/ /*[clinic end generated code: output=b217562e4fe6d6a6 input=1229cb2fb5ea948a]*/
{ {
/* Releasing the GIL for very small buffers is inefficient /* Releasing the GIL for very small buffers is inefficient
and may lower performance */ and may lower performance */
@ -1455,7 +1455,7 @@ zlib_crc32_impl(PyObject *module, Py_buffer *data, unsigned int value)
} else { } else {
value = crc32(value, data->buf, (unsigned int)data->len); value = crc32(value, data->buf, (unsigned int)data->len);
} }
return PyLong_FromUnsignedLong(value & 0xffffffffU); return value;
} }

View file

@ -366,7 +366,9 @@
<ClCompile Include="..\Modules\arraymodule.c" /> <ClCompile Include="..\Modules\arraymodule.c" />
<ClCompile Include="..\Modules\atexitmodule.c" /> <ClCompile Include="..\Modules\atexitmodule.c" />
<ClCompile Include="..\Modules\audioop.c" /> <ClCompile Include="..\Modules\audioop.c" />
<ClCompile Include="..\Modules\binascii.c" /> <ClCompile Include="..\Modules\binascii.c">
<PreprocessorDefinitions>USE_ZLIB_CRC32;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<ClCompile Include="..\Modules\cmathmodule.c" /> <ClCompile Include="..\Modules\cmathmodule.c" />
<ClCompile Include="..\Modules\_datetimemodule.c" /> <ClCompile Include="..\Modules\_datetimemodule.c" />
<ClCompile Include="..\Modules\errnomodule.c" /> <ClCompile Include="..\Modules\errnomodule.c" />