diff --git a/Include/Python.h b/Include/Python.h index ae384ee0153..01b98f9b138 100644 --- a/Include/Python.h +++ b/Include/Python.h @@ -101,7 +101,7 @@ #include "warnings.h" #include "weakrefobject.h" #include "structseq.h" - +#include "accu.h" #include "codecs.h" #include "pyerrors.h" diff --git a/Include/accu.h b/Include/accu.h new file mode 100644 index 00000000000..9655d37bb1f --- /dev/null +++ b/Include/accu.h @@ -0,0 +1,35 @@ +#ifndef Py_LIMITED_API +#ifndef Py_ACCU_H +#define Py_ACCU_H + +/*** This is a private API for use by the interpreter and the stdlib. + *** Its definition may be changed or removed at any moment. + ***/ + +/* + * A two-level accumulator of unicode objects that avoids both the overhead + * of keeping a huge number of small separate objects, and the quadratic + * behaviour of using a naive repeated concatenation scheme. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + PyObject *large; /* A list of previously accumulated large strings */ + PyObject *small; /* Pending small strings */ +} _PyAccu; + +PyAPI_FUNC(int) _PyAccu_Init(_PyAccu *acc); +PyAPI_FUNC(int) _PyAccu_Accumulate(_PyAccu *acc, PyObject *unicode); +PyAPI_FUNC(PyObject *) _PyAccu_FinishAsList(_PyAccu *acc); +PyAPI_FUNC(PyObject *) _PyAccu_Finish(_PyAccu *acc); +PyAPI_FUNC(void) _PyAccu_Destroy(_PyAccu *acc); + +#ifdef __cplusplus +} +#endif + +#endif /* Py_ACCU_H */ +#endif /* Py_LIMITED_API */ diff --git a/Lib/test/test_list.py b/Lib/test/test_list.py index 3510bd5febe..d7e6629ba7c 100644 --- a/Lib/test/test_list.py +++ b/Lib/test/test_list.py @@ -59,6 +59,17 @@ def imul(a, b): a *= b self.assertRaises((MemoryError, OverflowError), mul, lst, n) self.assertRaises((MemoryError, OverflowError), imul, lst, n) + def test_repr_large(self): + # Check the repr of large list objects + def check(n): + l = [0] * n + s = repr(l) + self.assertEqual(s, + '[' + ', '.join(['0'] * n) + ']') + check(10) # check our checking code + check(1000000) + + def test_main(verbose=None): support.run_unittest(ListTest) diff --git a/Lib/test/test_tuple.py b/Lib/test/test_tuple.py index 75fbe45ec4b..1464a0e50f4 100644 --- a/Lib/test/test_tuple.py +++ b/Lib/test/test_tuple.py @@ -154,6 +154,16 @@ def test_bug7466(self): # Trying to untrack an unfinished tuple could crash Python self._not_tracked(tuple(gc.collect() for i in range(101))) + def test_repr_large(self): + # Check the repr of large list objects + def check(n): + l = (0,) * n + s = repr(l) + self.assertEqual(s, + '(' + ', '.join(['0'] * n) + ')') + check(10) # check our checking code + check(1000000) + def test_main(): support.run_unittest(TupleTest) diff --git a/Makefile.pre.in b/Makefile.pre.in index 52743e434ab..51da7a55af6 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -342,6 +342,7 @@ PYTHON_OBJS= \ # Objects OBJECT_OBJS= \ Objects/abstract.o \ + Objects/accu.o \ Objects/boolobject.o \ Objects/bytes_methods.o \ Objects/bytearrayobject.o \ @@ -661,6 +662,7 @@ $(srcdir)/Objects/typeslots.inc: $(srcdir)/Include/typeslots.h $(srcdir)/Objects PYTHON_HEADERS= \ Include/Python.h \ Include/abstract.h \ + Include/accu.h \ Include/asdl.h \ Include/ast.h \ Include/bltinmodule.h \ diff --git a/Misc/NEWS b/Misc/NEWS index 9564891fc13..af31f9f67dd 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,9 @@ What's New in Python 3.3 Alpha 1? Core and Builtins ----------------- +- Issue #12911: Fix memory consumption when calculating the repr() of huge + tuples or lists. + - PEP 393: flexible string representation. Thanks to Torsten Becker for the initial implementation, and Victor Stinner for various bug fixes. diff --git a/Objects/accu.c b/Objects/accu.c new file mode 100644 index 00000000000..88e8f08f599 --- /dev/null +++ b/Objects/accu.c @@ -0,0 +1,114 @@ +/* Accumulator struct implementation */ + +#include "Python.h" + +static PyObject * +join_list_unicode(PyObject *lst) +{ + /* return ''.join(lst) */ + PyObject *sep, *ret; + sep = PyUnicode_FromStringAndSize("", 0); + ret = PyUnicode_Join(sep, lst); + Py_DECREF(sep); + return ret; +} + +int +_PyAccu_Init(_PyAccu *acc) +{ + /* Lazily allocated */ + acc->large = NULL; + acc->small = PyList_New(0); + if (acc->small == NULL) + return -1; + return 0; +} + +static int +flush_accumulator(_PyAccu *acc) +{ + Py_ssize_t nsmall = PyList_GET_SIZE(acc->small); + if (nsmall) { + int ret; + PyObject *joined; + if (acc->large == NULL) { + acc->large = PyList_New(0); + if (acc->large == NULL) + return -1; + } + joined = join_list_unicode(acc->small); + if (joined == NULL) + return -1; + if (PyList_SetSlice(acc->small, 0, nsmall, NULL)) { + Py_DECREF(joined); + return -1; + } + ret = PyList_Append(acc->large, joined); + Py_DECREF(joined); + return ret; + } + return 0; +} + +int +_PyAccu_Accumulate(_PyAccu *acc, PyObject *unicode) +{ + Py_ssize_t nsmall; + assert(PyUnicode_Check(unicode)); + + if (PyList_Append(acc->small, unicode)) + return -1; + nsmall = PyList_GET_SIZE(acc->small); + /* Each item in a list of unicode objects has an overhead (in 64-bit + * builds) of: + * - 8 bytes for the list slot + * - 56 bytes for the header of the unicode object + * that is, 64 bytes. 100000 such objects waste more than 6MB + * compared to a single concatenated string. + */ + if (nsmall < 100000) + return 0; + return flush_accumulator(acc); +} + +PyObject * +_PyAccu_FinishAsList(_PyAccu *acc) +{ + int ret; + PyObject *res; + + ret = flush_accumulator(acc); + Py_CLEAR(acc->small); + if (ret) { + Py_CLEAR(acc->large); + return NULL; + } + res = acc->large; + acc->large = NULL; + return res; +} + +PyObject * +_PyAccu_Finish(_PyAccu *acc) +{ + PyObject *list, *res; + if (acc->large == NULL) { + list = acc->small; + acc->small = NULL; + } + else { + list = _PyAccu_FinishAsList(acc); + if (!list) + return NULL; + } + res = join_list_unicode(list); + Py_DECREF(list); + return res; +} + +void +_PyAccu_Destroy(_PyAccu *acc) +{ + Py_CLEAR(acc->small); + Py_CLEAR(acc->large); +} diff --git a/Objects/listobject.c b/Objects/listobject.c index 1c516e962b0..28d94e74b60 100644 --- a/Objects/listobject.c +++ b/Objects/listobject.c @@ -321,70 +321,59 @@ static PyObject * list_repr(PyListObject *v) { Py_ssize_t i; - PyObject *s, *temp; - PyObject *pieces = NULL, *result = NULL; + PyObject *s = NULL; + _PyAccu acc; + static PyObject *sep = NULL; + + if (Py_SIZE(v) == 0) { + return PyUnicode_FromString("[]"); + } + + if (sep == NULL) { + sep = PyUnicode_FromString(", "); + if (sep == NULL) + return NULL; + } i = Py_ReprEnter((PyObject*)v); if (i != 0) { return i > 0 ? PyUnicode_FromString("[...]") : NULL; } - if (Py_SIZE(v) == 0) { - result = PyUnicode_FromString("[]"); - goto Done; - } + if (_PyAccu_Init(&acc)) + goto error; - pieces = PyList_New(0); - if (pieces == NULL) - goto Done; + s = PyUnicode_FromString("["); + if (s == NULL || _PyAccu_Accumulate(&acc, s)) + goto error; + Py_CLEAR(s); /* Do repr() on each element. Note that this may mutate the list, so must refetch the list size on each iteration. */ for (i = 0; i < Py_SIZE(v); ++i) { - int status; if (Py_EnterRecursiveCall(" while getting the repr of a list")) - goto Done; + goto error; s = PyObject_Repr(v->ob_item[i]); Py_LeaveRecursiveCall(); - if (s == NULL) - goto Done; - status = PyList_Append(pieces, s); - Py_DECREF(s); /* append created a new ref */ - if (status < 0) - goto Done; + if (i > 0 && _PyAccu_Accumulate(&acc, sep)) + goto error; + if (s == NULL || _PyAccu_Accumulate(&acc, s)) + goto error; + Py_CLEAR(s); } - - /* Add "[]" decorations to the first and last items. */ - assert(PyList_GET_SIZE(pieces) > 0); - s = PyUnicode_FromString("["); - if (s == NULL) - goto Done; - temp = PyList_GET_ITEM(pieces, 0); - PyUnicode_AppendAndDel(&s, temp); - PyList_SET_ITEM(pieces, 0, s); - if (s == NULL) - goto Done; - s = PyUnicode_FromString("]"); - if (s == NULL) - goto Done; - temp = PyList_GET_ITEM(pieces, PyList_GET_SIZE(pieces) - 1); - PyUnicode_AppendAndDel(&temp, s); - PyList_SET_ITEM(pieces, PyList_GET_SIZE(pieces) - 1, temp); - if (temp == NULL) - goto Done; + if (s == NULL || _PyAccu_Accumulate(&acc, s)) + goto error; + Py_CLEAR(s); - /* Paste them all together with ", " between. */ - s = PyUnicode_FromString(", "); - if (s == NULL) - goto Done; - result = PyUnicode_Join(s, pieces); - Py_DECREF(s); - -Done: - Py_XDECREF(pieces); Py_ReprLeave((PyObject *)v); - return result; + return _PyAccu_Finish(&acc); + +error: + _PyAccu_Destroy(&acc); + Py_XDECREF(s); + Py_ReprLeave((PyObject *)v); + return NULL; } static Py_ssize_t diff --git a/Objects/tupleobject.c b/Objects/tupleobject.c index ddb69e48507..54a580d4a33 100644 --- a/Objects/tupleobject.c +++ b/Objects/tupleobject.c @@ -240,13 +240,20 @@ static PyObject * tuplerepr(PyTupleObject *v) { Py_ssize_t i, n; - PyObject *s, *temp; - PyObject *pieces, *result = NULL; + PyObject *s = NULL; + _PyAccu acc; + static PyObject *sep = NULL; n = Py_SIZE(v); if (n == 0) return PyUnicode_FromString("()"); + if (sep == NULL) { + sep = PyUnicode_FromString(", "); + if (sep == NULL) + return NULL; + } + /* While not mutable, it is still possible to end up with a cycle in a tuple through an object that stores itself within a tuple (and thus infinitely asks for the repr of itself). This should only be @@ -256,52 +263,42 @@ tuplerepr(PyTupleObject *v) return i > 0 ? PyUnicode_FromString("(...)") : NULL; } - pieces = PyTuple_New(n); - if (pieces == NULL) - return NULL; + if (_PyAccu_Init(&acc)) + goto error; + + s = PyUnicode_FromString("("); + if (s == NULL || _PyAccu_Accumulate(&acc, s)) + goto error; + Py_CLEAR(s); /* Do repr() on each element. */ for (i = 0; i < n; ++i) { if (Py_EnterRecursiveCall(" while getting the repr of a tuple")) - goto Done; + goto error; s = PyObject_Repr(v->ob_item[i]); Py_LeaveRecursiveCall(); - if (s == NULL) - goto Done; - PyTuple_SET_ITEM(pieces, i, s); + if (i > 0 && _PyAccu_Accumulate(&acc, sep)) + goto error; + if (s == NULL || _PyAccu_Accumulate(&acc, s)) + goto error; + Py_CLEAR(s); } + if (n > 1) + s = PyUnicode_FromString(")"); + else + s = PyUnicode_FromString(",)"); + if (s == NULL || _PyAccu_Accumulate(&acc, s)) + goto error; + Py_CLEAR(s); - /* Add "()" decorations to the first and last items. */ - assert(n > 0); - s = PyUnicode_FromString("("); - if (s == NULL) - goto Done; - temp = PyTuple_GET_ITEM(pieces, 0); - PyUnicode_AppendAndDel(&s, temp); - PyTuple_SET_ITEM(pieces, 0, s); - if (s == NULL) - goto Done; - - s = PyUnicode_FromString(n == 1 ? ",)" : ")"); - if (s == NULL) - goto Done; - temp = PyTuple_GET_ITEM(pieces, n-1); - PyUnicode_AppendAndDel(&temp, s); - PyTuple_SET_ITEM(pieces, n-1, temp); - if (temp == NULL) - goto Done; - - /* Paste them all together with ", " between. */ - s = PyUnicode_FromString(", "); - if (s == NULL) - goto Done; - result = PyUnicode_Join(s, pieces); - Py_DECREF(s); - -Done: - Py_DECREF(pieces); Py_ReprLeave((PyObject *)v); - return result; + return _PyAccu_Finish(&acc); + +error: + _PyAccu_Destroy(&acc); + Py_XDECREF(s); + Py_ReprLeave((PyObject *)v); + return NULL; } /* The addend 82520, was selected from the range(0, 1000000) for diff --git a/PC/VC6/pythoncore.dsp b/PC/VC6/pythoncore.dsp index 60074ed31a3..61247b7f726 100644 --- a/PC/VC6/pythoncore.dsp +++ b/PC/VC6/pythoncore.dsp @@ -205,6 +205,10 @@ SOURCE=..\..\Objects\abstract.c # End Source File # Begin Source File +SOURCE=..\..\Objects\accu.c +# End Source File +# Begin Source File + SOURCE=..\..\Parser\acceler.c # End Source File # Begin Source File diff --git a/PC/VS7.1/pythoncore.vcproj b/PC/VS7.1/pythoncore.vcproj index 205dd8fb690..d882911a684 100644 --- a/PC/VS7.1/pythoncore.vcproj +++ b/PC/VS7.1/pythoncore.vcproj @@ -444,6 +444,9 @@ + + diff --git a/PC/VS8.0/pythoncore.vcproj b/PC/VS8.0/pythoncore.vcproj index 7252bcd603f..5e1ef1ac19c 100644 --- a/PC/VS8.0/pythoncore.vcproj +++ b/PC/VS8.0/pythoncore.vcproj @@ -634,6 +634,10 @@ RelativePath="..\..\Include\abstract.h" > + + @@ -1446,6 +1450,10 @@ RelativePath="..\..\Objects\abstract.c" > + + diff --git a/PCbuild/pythoncore.vcproj b/PCbuild/pythoncore.vcproj index 7e9f472d69f..69a8ca8c7c8 100644 --- a/PCbuild/pythoncore.vcproj +++ b/PCbuild/pythoncore.vcproj @@ -634,6 +634,10 @@ RelativePath="..\Include\abstract.h" > + + @@ -1454,6 +1458,10 @@ RelativePath="..\Objects\abstract.c" > + +