linux/arch/x86/lib/csum-partial_64.c
Linus Torvalds a476aae3f1 x86/csum: clean up `csum_partial' further
Commit 688eb8191b ("x86/csum: Improve performance of `csum_partial`")
ended up improving the code generation for the IP csum calculations, and
in particular special-casing the 40-byte case that is a hot case for
IPv6 headers.

It then had _another_ special case for the 64-byte unrolled loop, which
did two chains of 32-byte blocks, which allows modern CPUs to improve
performance by doing the chains in parallel thanks to renaming the carry
flag.

This just unifies the special cases and combines them into just one
single helper for the 40-byte csum case, and replaces the 64-byte case by an
80-byte case that just does that single helper twice.  It avoids having
all these different versions of inline assembly, and actually improved
performance further in my tests.

There was never anything magical about the 64-byte unrolled case, even
though it happens to be a common size (and typically is the cacheline
size).

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2024-01-04 15:42:30 -08:00

124 lines
3 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* arch/x86_64/lib/csum-partial.c
*
* This file contains network checksum routines that are better done
* in an architecture-specific manner due to speed.
*/
#include <linux/compiler.h>
#include <linux/export.h>
#include <asm/checksum.h>
#include <asm/word-at-a-time.h>
/*
 * Fold the 64-bit running checksum down to the 32-bit __wsum result.
 *
 * Adding the value rotated by 32 sums the two 32-bit halves (with
 * end-around carry) into the upper half, which the shift extracts.
 */
static inline __wsum csum_finalize_sum(u64 temp64)
{
	u64 folded = temp64 + ror64(temp64, 32);

	return (__force __wsum)(folded >> 32);
}
/*
 * Accumulate one 40-byte block (five 64-bit words at @m) into the
 * running checksum @sum using a single add-with-carry chain.
 *
 * The trailing "adcq $0" folds the final carry back into @sum, so no
 * carry is lost between successive calls.  The "m" constraints let the
 * compiler address each word directly from memory.
 */
static inline unsigned long update_csum_40b(unsigned long sum, const unsigned long m[5])
{
	asm("addq %1,%0\n\t"	/* sum += m[0]          */
	    "adcq %2,%0\n\t"	/* sum += m[1] + carry  */
	    "adcq %3,%0\n\t"	/* sum += m[2] + carry  */
	    "adcq %4,%0\n\t"	/* sum += m[3] + carry  */
	    "adcq %5,%0\n\t"	/* sum += m[4] + carry  */
	    "adcq $0,%0"	/* fold final carry back in */
	    :"+r" (sum)
	    :"m" (m[0]), "m" (m[1]), "m" (m[2]),
	     "m" (m[3]), "m" (m[4]));
	return sum;
}
/*
* Do a checksum on an arbitrary memory area.
* Returns a 32bit checksum.
*
* This isn't as time critical as it used to be because many NICs
* do hardware checksumming these days.
*
* Still, with CHECKSUM_COMPLETE this is called to compute
* checksums on IPv6 headers (40 bytes) and other small parts.
* it's best to have buff aligned on a 64-bit boundary
*/
__wsum csum_partial(const void *buff, int len, __wsum sum)
{
	/* 64-bit accumulator; the starting sum is folded in from the outset */
	u64 temp64 = (__force u64)sum;

	/* Do two 40-byte chunks in parallel to get better ILP */
	if (likely(len >= 80)) {
		u64 temp64_2 = 0;
		do {
			/*
			 * Two independent carry chains: modern CPUs rename
			 * the carry flag, so these run in parallel.
			 */
			temp64 = update_csum_40b(temp64, buff);
			temp64_2 = update_csum_40b(temp64_2, buff + 40);
			buff += 80;
			len -= 80;
		} while (len >= 80);

		/* Merge the second accumulator, folding its carry back in */
		asm("addq %1,%0\n\t"
		    "adcq $0,%0"
		    :"+r" (temp64): "r" (temp64_2));
	}

	/*
	 * len == 40 is the hot case due to IPv6 headers, so return
	 * early for that exact case without checking the tail bytes.
	 */
	if (len >= 40) {
		temp64 = update_csum_40b(temp64, buff);
		len -= 40;
		if (!len)
			return csum_finalize_sum(temp64);
		buff += 40;
	}

	/* Tail (0..39 bytes left): binary decomposition, 32/16/8-byte steps */
	if (len & 32) {
		asm("addq 0*8(%[src]),%[res]\n\t"
		    "adcq 1*8(%[src]),%[res]\n\t"
		    "adcq 2*8(%[src]),%[res]\n\t"
		    "adcq 3*8(%[src]),%[res]\n\t"
		    "adcq $0,%[res]"
		    : [res] "+r"(temp64)
		    : [src] "r"(buff), "m"(*(const char(*)[32])buff));
		buff += 32;
	}
	if (len & 16) {
		asm("addq 0*8(%[src]),%[res]\n\t"
		    "adcq 1*8(%[src]),%[res]\n\t"
		    "adcq $0,%[res]"
		    : [res] "+r"(temp64)
		    : [src] "r"(buff), "m"(*(const char(*)[16])buff));
		buff += 16;
	}
	if (len & 8) {
		asm("addq 0*8(%[src]),%[res]\n\t"
		    "adcq $0,%[res]"
		    : [res] "+r"(temp64)
		    : [src] "r"(buff), "m"(*(const char(*)[8])buff));
		buff += 8;
	}
	if (len & 7) {
		/*
		 * 1..7 trailing bytes: load a full word (the helper is safe
		 * against faulting past the buffer) and shift out the bytes
		 * beyond len.  shift = (64 - 8*(len&7)) masked to 0..63.
		 */
		unsigned int shift = (-len << 3) & 63;
		unsigned long trail;
		trail = (load_unaligned_zeropad(buff) << shift) >> shift;
		asm("addq %[trail],%[res]\n\t"
		    "adcq $0,%[res]"
		    : [res] "+r"(temp64)
		    : [trail] "r"(trail));
	}
	return csum_finalize_sum(temp64);
}
EXPORT_SYMBOL(csum_partial);
/*
* this routine is used for miscellaneous IP-like checksums, mainly
* in icmp.c
*/
__sum16 ip_compute_csum(const void *buff, int len)
{
return csum_fold(csum_partial(buff, len, 0));
}
EXPORT_SYMBOL(ip_compute_csum);