diff --git a/Objects/listobject.c b/Objects/listobject.c
index fc20a9bff3a..470ad8eb813 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1628,6 +1628,15 @@ sortslice_advance(sortslice *slice, Py_ssize_t n)
 /* Avoid malloc for small temp arrays. */
 #define MERGESTATE_TEMP_SIZE 256
 
+/* The largest value of minrun. This must be a power of 2, and >= 1, so that
+ * the merge_compute_minrun() algorithm guarantees to return a result no
+ * larger than this.
+ */
+#define MAX_MINRUN 64
+#if ((MAX_MINRUN) < 1) || ((MAX_MINRUN) & ((MAX_MINRUN) - 1))
+#error "MAX_MINRUN must be a power of 2, and >= 1"
+#endif
+
 /* One MergeState exists on the stack per invocation of mergesort. It's just
  * a convenient way to pass state around among the helper functions.
  */
@@ -1685,68 +1694,133 @@ struct s_MergeState {
     int (*tuple_elem_compare)(PyObject *, PyObject *, MergeState *);
 };
 
-/* binarysort is the best method for sorting small arrays: it does
-   few compares, but can do data movement quadratic in the number of
-   elements.
-   [lo.keys, hi) is a contiguous slice of a list of keys, and is sorted via
-   binary insertion. This sort is stable.
-   On entry, must have lo.keys <= start <= hi, and that
-   [lo.keys, start) is already sorted (pass start == lo.keys if you don't
-   know!).
-   If islt() complains return -1, else 0.
+/* binarysort is the best method for sorting small arrays: it does few
+   compares, but can do data movement quadratic in the number of elements.
+   ss->keys is viewed as an array of n keys, a[:n]. a[:ok] is already sorted.
+   Pass ok = 0 (or 1) if you don't know.
+   It's sorted in-place, by a stable binary insertion sort. If ss->values
+   isn't NULL, it's permuted in lockstep with ss->keys.
+   On entry, must have n >= 1, and 0 <= ok <= n <= MAX_MINRUN.
+   Return -1 if comparison raises an exception, else 0.
    Even in case of error, the output slice will be some permutation of the
    input (nothing is lost or duplicated). */
 static int
-binarysort(MergeState *ms, sortslice lo, PyObject **hi, PyObject **start)
+binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
 {
-    Py_ssize_t k;
-    PyObject **l, **p, **r;
+    Py_ssize_t k; /* for IFLT macro expansion */
+    PyObject ** const a = ss->keys;
+    PyObject ** const v = ss->values;
+    const bool has_values = v != NULL;
     PyObject *pivot;
+    Py_ssize_t M;
 
-    assert(lo.keys <= start && start <= hi);
-    /* assert [lo.keys, start) is sorted */
-    if (lo.keys == start)
-        ++start;
-    for (; start < hi; ++start) {
-        /* set l to where *start belongs */
-        l = lo.keys;
-        r = start;
-        pivot = *r;
-        /* Invariants:
-         * pivot >= all in [lo.keys, l).
-         * pivot < all in [r, start).
-         * These are vacuously true at the start.
-         */
-        assert(l < r);
-        do {
-            p = l + ((r - l) >> 1);
-            IFLT(pivot, *p)
-                r = p;
+    assert(0 <= ok && ok <= n && 1 <= n && n <= MAX_MINRUN);
+    /* assert a[:ok] is sorted */
+    if (! ok)
+        ++ok;
+    /* Regular insertion sort has average- and worst-case O(n**2) cost
+       for both # of comparisons and number of bytes moved. But its branches
+       are highly predictable, and it loves sorted input (n-1 compares and no
+       data movement). This is significant in cases like sortperf.py's %sort,
+       where an out-of-order element near the start of a run is moved into
+       place slowly but then the remaining elements up to length minrun are
+       generally at worst one slot away from their correct position (so only
+       need 1 or 2 compares to resolve). If comparisons are very fast (such
+       as for a list of Python floats), the simple inner loop leaves it
+       very competitive with binary insertion, despite that it does
+       significantly more compares overall on random data.
+
+       Binary insertion sort has worst, average, and best case O(n log n)
+       cost for # of comparisons, but worst and average case O(n**2) cost
+       for data movement. The more expensive the comparisons, the more important
+       the comparison advantage. But its branches are less predictable the
+       more "randomish" the data, and that's so significant its worst case
+       in real life is random input rather than reverse-ordered (which does
+       about twice the data movement that random input does).
+
+       Note that the number of bytes moved doesn't seem to matter. MAX_MINRUN
+       of 64 is so small that the key and value pointers all fit in a corner
+       of L1 cache, and moving things around in that is very fast. */
+#if 0 // ordinary insertion sort.
+    PyObject * vpivot = NULL;
+    for (; ok < n; ++ok) {
+        pivot = a[ok];
+        if (has_values)
+            vpivot = v[ok];
+        for (M = ok - 1; M >= 0; --M) {
+            k = ISLT(pivot, a[M]);
+            if (k < 0) {
+                a[M + 1] = pivot;
+                if (has_values)
+                    v[M + 1] = vpivot;
+                goto fail;
+            }
+            else if (k) {
+                a[M + 1] = a[M];
+                if (has_values)
+                    v[M + 1] = v[M];
+            }
             else
-                l = p+1;
-        } while (l < r);
-        assert(l == r);
-        /* The invariants still hold, so pivot >= all in [lo.keys, l) and
-           pivot < all in [l, start), so pivot belongs at l. Note
-           that if there are elements equal to pivot, l points to the
-           first slot after them -- that's why this sort is stable.
-           Slide over to make room.
-           Caution: using memmove is much slower under MSVC 5;
-           we're not usually moving many slots. */
-        for (p = start; p > l; --p)
-            *p = *(p-1);
-        *l = pivot;
-        if (lo.values != NULL) {
-            Py_ssize_t offset = lo.values - lo.keys;
-            p = start + offset;
-            pivot = *p;
-            l += offset;
-            for ( ; p > l; --p)
-                *p = *(p-1);
-            *l = pivot;
+                break;
+        }
+        a[M + 1] = pivot;
+        if (has_values)
+            v[M + 1] = vpivot;
+    }
+#else // binary insertion sort
+    Py_ssize_t L, R;
+    for (; ok < n; ++ok) {
+        /* set L to where a[ok] belongs */
+        L = 0;
+        R = ok;
+        pivot = a[ok];
+        /* Slice invariants. vacuously true at the start:
+         * all a[0:L] <= pivot
+         * all a[L:R] unknown
+         * all a[R:ok] > pivot
+         */
+        assert(L < R);
+        do {
+            /* don't do silly ;-) things to prevent overflow when finding
+               the midpoint; L and R are very far from filling a Py_ssize_t */
+            M = (L + R) >> 1;
+#if 1 // straightforward, but highly unpredictable branch on random data
+            IFLT(pivot, a[M])
+                R = M;
+            else
+                L = M + 1;
+#else
+            /* Try to get compiler to generate conditional move instructions
+               instead. Works fine, but leaving it disabled for now because
+               it's not yielding consistently faster sorts. Needs more
+               investigation. More computation in the inner loop adds its own
+               costs, which can be significant when compares are fast. */
+            k = ISLT(pivot, a[M]);
+            if (k < 0)
+                goto fail;
+            Py_ssize_t Mp1 = M + 1;
+            R = k ? M : R;
+            L = k ? L : Mp1;
+#endif
+        } while (L < R);
+        assert(L == R);
+        /* a[:L] holds all elements from a[:ok] <= pivot now, so pivot belongs
+           at index L. Slide a[L:ok] to the right a slot to make room for it.
+           Caution: using memmove is much slower under MSVC 5; we're not
+           usually moving many slots. Years later: under Visual Studio 2022,
+           memmove seems just slightly slower than doing it "by hand".
+        */
+        for (M = ok; M > L; --M)
+            a[M] = a[M - 1];
+        a[L] = pivot;
+        if (has_values) {
+            pivot = v[ok];
+            for (M = ok; M > L; --M)
+                v[M] = v[M - 1];
+            v[L] = pivot;
         }
     }
+#endif // pick binary or regular insertion sort
     return 0;
 
  fail:
@@ -2559,10 +2633,10 @@ merge_force_collapse(MergeState *ms)
 /* Compute a good value for the minimum run length; natural runs shorter
  * than this are boosted artificially via binary insertion.
  *
- * If n < 64, return n (it's too small to bother with fancy stuff).
- * Else if n is an exact power of 2, return 32.
- * Else return an int k, 32 <= k <= 64, such that n/k is close to, but
- * strictly less than, an exact power of 2.
+ * If n < MAX_MINRUN, return n (it's too small to bother with fancy stuff).
+ * Else if n is an exact power of 2, return MAX_MINRUN / 2.
+ * Else return an int k, MAX_MINRUN / 2 <= k <= MAX_MINRUN, such that n/k is
+ * close to, but strictly less than, an exact power of 2.
  *
  * See listsort.txt for more info.
  */
@@ -2572,7 +2646,7 @@ merge_compute_minrun(Py_ssize_t n)
     Py_ssize_t r = 0;           /* becomes 1 if any 1 bits are shifted off */
 
     assert(n >= 0);
-    while (n >= 64) {
+    while (n >= MAX_MINRUN) {
         r |= n & 1;
         n >>= 1;
     }
@@ -2956,7 +3030,7 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse)
         if (n < minrun) {
             const Py_ssize_t force = nremaining <= minrun ?
                               nremaining : minrun;
-            if (binarysort(&ms, lo, lo.keys + force, lo.keys + n) < 0)
+            if (binarysort(&ms, &lo, force, n) < 0)
                 goto fail;
             n = force;
         }
diff --git a/Objects/listsort.txt b/Objects/listsort.txt
index 4f84e2c87da..f387d9c116e 100644
--- a/Objects/listsort.txt
+++ b/Objects/listsort.txt
@@ -270,9 +270,9 @@ result. This has two primary good effects:
 
 Computing minrun
 ----------------
-If N < 64, minrun is N. IOW, binary insertion sort is used for the whole
-array then; it's hard to beat that given the overheads of trying something
-fancier (see note BINSORT).
+If N < MAX_MINRUN, minrun is N. IOW, binary insertion sort is used for the
+whole array then; it's hard to beat that given the overheads of trying
+something fancier (see note BINSORT).
 
 When N is a power of 2, testing on random data showed that minrun values of
 16, 32, 64 and 128 worked about equally well. At 256 the data-movement cost
@@ -310,12 +310,13 @@ place, and r < minrun is small compared to N), or q a little larger than a
 power of 2 regardless of r (then we've got a case similar to "2112", again
 leaving too little work for the last merge to do).
 
-Instead we pick a minrun in range(32, 65) such that N/minrun is exactly a
-power of 2, or if that isn't possible, is close to, but strictly less than,
-a power of 2. This is easier to do than it may sound: take the first 6
-bits of N, and add 1 if any of the remaining bits are set. In fact, that
-rule covers every case in this section, including small N and exact powers
-of 2; merge_compute_minrun() is a deceptively simple function.
+Instead we pick a minrun in range(MAX_MINRUN / 2, MAX_MINRUN + 1) such that
+N/minrun is exactly a power of 2, or if that isn't possible, is close to, but
+strictly less than, a power of 2. This is easier to do than it may sound:
+take the first log2(MAX_MINRUN) bits of N, and add 1 if any of the remaining
+bits are set. In fact, that rule covers every case in this section, including
+small N and exact powers of 2; merge_compute_minrun() is a deceptively simple
+function.
 
 
 The Merge Pattern
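
For readers tracing the minrun rule above, the following is a small standalone
sketch (illustrative only, not part of the patch) of what merge_compute_minrun()
computes with the patch's MAX_MINRUN of 64: keep halving N until it drops below
MAX_MINRUN, remembering whether any 1 bits were shifted off, which amounts to
taking the first log2(MAX_MINRUN) bits of N and adding 1 if any remaining bits
are set.

    /* Standalone illustration (not part of the patch) of the minrun rule:
       keep halving n until it drops below MAX_MINRUN, remembering in r
       whether any 1 bits were shifted off, then return n + r. */
    #include <assert.h>
    #include <stdio.h>

    #define MAX_MINRUN 64          /* the patch's default */

    static long
    compute_minrun(long n)
    {
        long r = 0;                /* becomes 1 if any 1 bits are shifted off */
        assert(n >= 0);
        while (n >= MAX_MINRUN) {
            r |= n & 1;
            n >>= 1;
        }
        return n + r;
    }

    int
    main(void)
    {
        /* 63 is returned unchanged (too small for fancy stuff); 2048 is an
           exact power of 2, giving MAX_MINRUN / 2 = 32 (64 balanced runs);
           2049 gives 33, so N/minrun is just under 64; 2112 also gives 33,
           which is exactly 64 runs of length 33. */
        long samples[] = {63, 64, 2048, 2049, 2112};
        for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); ++i)
            printf("N=%ld  minrun=%ld\n", samples[i], compute_minrun(samples[i]));
        return 0;
    }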