internal/bytealg: optimize Count/CountString for PPC64/Power10

Power10 adds a handful of new instructions which make this
noticeably quicker for smaller values.

Likewise, since the vector loop requires 32B to enter,
unroll it once to count 32B per iteration. This
improvement benefits all PPC64 cpus.

On Power10 comparing a binary built with GOPPC64=power8

CountSingle/10     8.99ns ± 0%    5.55ns ± 3%   -38.24%
CountSingle/16     7.55ns ± 0%    5.56ns ± 3%   -26.37%
CountSingle/17     7.45ns ± 0%    5.25ns ± 0%   -29.52%
CountSingle/31     18.4ns ± 0%     6.2ns ± 0%   -66.41%
CountSingle/32     6.17ns ± 0%    5.04ns ± 0%   -18.37%
CountSingle/33     7.13ns ± 0%    5.99ns ± 0%   -15.94%
CountSingle/4K      198ns ± 0%     115ns ± 0%   -42.08%
CountSingle/4M      190µs ± 0%     109µs ± 0%   -42.49%
CountSingle/64M    3.28ms ± 0%    2.08ms ± 0%   -36.53%

Furthermore, comparing the new tail implementation on
GOPPC64=power8 with GOPPC64=power10:

CountSingle/10     5.55ns ± 3%    4.52ns ± 1%  -18.66%
CountSingle/16     5.56ns ± 3%    4.80ns ± 0%  -13.65%
CountSingle/17     5.25ns ± 0%    4.79ns ± 0%   -8.78%
CountSingle/31     6.17ns ± 0%    4.82ns ± 0%  -21.79%
CountSingle/32     5.04ns ± 0%    5.09ns ± 6%   +1.01%
CountSingle/33     5.99ns ± 0%    5.42ns ± 2%   -9.54%

Change-Id: I62d80be3b5d706e1abbb4bec7d6278a939a5eed4
Reviewed-on: https://go-review.googlesource.com/c/go/+/512695
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Paul Murphy <murp@ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
This commit is contained in:
Paul E. Murphy 2023-07-17 15:23:28 -05:00 committed by Paul Murphy
parent 02b548e5c8
commit 756841bffa

View file

@ -8,89 +8,147 @@
#include "textflag.h"
TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
// R3 = byte array pointer
// R3 = byte array pointer
// R4 = length
MOVBZ R6, R5 // R5 = byte
BR countbytebody<>(SB)
// R6 = byte to count
MTVRD R6, V1 // move compare byte
MOVD R6, R5
VSPLTB $7, V1, V1 // replicate byte across V1
BR countbytebody<>(SB)
TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
// R3 = byte array pointer
// R4 = length
MOVBZ R5, R5 // R5 = byte
BR countbytebody<>(SB)
// R5 = byte to count
MTVRD R5, V1 // move compare byte
VSPLTB $7, V1, V1 // replicate byte across V1
BR countbytebody<>(SB)
// R3: addr of string
// R4: len of string
// R5: byte to count
// V1: byte to count, splatted.
// On exit:
// R3: return value
// endianness shouldn't matter since we are just counting and order
// is irrelevant
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
DCBT (R3) // Prepare cache line.
MOVD R0, R18 // byte count
MOVD R3, R19 // Save base address for calculating the index later.
MOVD R4, R16
MOVD $0, R18 // byte count
MOVD R5, R6
RLDIMI $8, R6, $48, R6
RLDIMI $16, R6, $32, R6
RLDIMI $32, R6, $0, R6 // fill reg with the byte to count
#ifndef GOPPC64_power10
RLDIMI $8, R5, $48, R5
RLDIMI $16, R5, $32, R5
RLDIMI $32, R5, $0, R5 // fill reg with the byte to count
#endif
VSPLTISW $3, V4 // used for shift
MTVRD R6, V1 // move compare byte
VSPLTB $7, V1, V1 // replicate byte across V1
CMPU R4, $32 // Check if it's a small string (<32 bytes)
BLT tail // Jump to the small string case
XXLXOR VS37, VS37, VS37 // clear V5 (aka VS37) to use as accumulator
CMPU R4, $32 // Check if it's a small string (<32 bytes)
BLT tail // Jump to the small string case
SRD $5, R4, R20
MOVD R20, CTR
MOVD $16, R21
XXLXOR V4, V4, V4
XXLXOR V5, V5, V5
PCALIGN $16
cmploop:
LXVW4X (R3), VS32 // load bytes from string
LXVD2X (R0)(R3), V0 // Count 32B per loop with two vector accumulators.
LXVD2X (R21)(R3), V2
VCMPEQUB V2, V1, V2
VCMPEQUB V0, V1, V0
VPOPCNTD V2, V2 // A match is 0xFF or 0. Count the bits into doubleword buckets.
VPOPCNTD V0, V0
VADDUDM V0, V4, V4 // Accumulate the popcounts. They are 8x the count.
VADDUDM V2, V5, V5 // The count will be fixed up afterwards.
ADD $32, R3
BDNZ cmploop
// when the bytes match, the corresponding byte contains all 1s
VCMPEQUB V1, V0, V2 // compare bytes
VPOPCNTD V2, V3 // each double word contains its count
VADDUDM V3, V5, V5 // accumulate bit count in each double word
ADD $16, R3, R3 // increment pointer
SUB $16, R16, R16 // remaining bytes
CMP R16, $16 // at least 16 remaining?
BGE cmploop
VSRD V5, V4, V5 // shift by 3 to convert bits to bytes
VSLDOI $8, V5, V5, V6 // get the double word values from vector
MFVSRD V5, R9
MFVSRD V6, R10
ADD R9, R10, R9
ADD R9, R18, R18
VADDUDM V4, V5, V5
MFVSRD V5, R18
VSLDOI $8, V5, V5, V5
MFVSRD V5, R21
ADD R21, R18, R18
ANDCC $31, R4, R4
// Skip the tail processing if no bytes remaining.
BEQ tail_0
tail:
CMP R16, $8 // 8 bytes left?
BLT small
#ifdef GOPPC64_power10
SRD $3, R18, R18 // Fix the vector loop count before counting the tail on P10.
MOVD (R3), R12 // load 8 bytes
CMPB R12, R6, R17 // compare bytes
POPCNTD R17, R15 // bit count
SRD $3, R15, R15 // byte count
ADD R15, R18, R18 // add to byte count
tail: // Count the last 0 - 31 bytes.
CMP R4, $16
BLE small_tail_p10
LXV 0(R3), V0
VCMPEQUB V0, V1, V0
VCNTMBB V0, $1, R14 // Sum the value of bit 0 of each byte of the compare into R14.
SRD $56, R14, R14 // The result of VCNTMBB is shifted. Unshift it.
ADD R14, R18, R18
ADD $16, R3, R3
ANDCC $15, R4, R4
next1:
ADD $8, R3, R3
SUB $8, R16, R16 // remaining bytes
BR tail
small:
CMP $0, R16 // any remaining
BEQ done
MOVBZ (R3), R12 // check each remaining byte
CMP R12, R5
BNE next2
ADD $1, R18
next2:
SUB $1, R16
ADD $1, R3 // inc address
BR small
done:
MOVD R18, R3 // return count
small_tail_p10:
SLD $56, R4, R6
LXVLL R3, R6, V0
VCMPEQUB V0, V1, V0
VCLRRB V0, R4, V0 // If <16B being compared, clear matches of the 16-R4 bytes.
VCNTMBB V0, $1, R14 // Sum the value of bit 0 of each byte of the compare into R14.
SRD $56, R14, R14 // The result of VCNTMBB is shifted. Unshift it.
ADD R14, R18, R3
RET
#else
tail: // Count the last 0 - 31 bytes.
CMP R4, $16
BLT tail_8
MOVD (R3), R12
MOVD 8(R3), R14
CMPB R12, R5, R12
CMPB R14, R5, R14
POPCNTD R12, R12
POPCNTD R14, R14
ADD R12, R18, R18
ADD R14, R18, R18
ADD $16, R3, R3
ADD $-16, R4, R4
tail_8: // Count the remaining 0 - 15 bytes.
CMP R4, $8
BLT tail_4
MOVD (R3), R12
CMPB R12, R5, R12
POPCNTD R12, R12
ADD R12, R18, R18
ADD $8, R3, R3
ADD $-8, R4, R4
tail_4: // Count the remaining 0 - 7 bytes.
CMP R4, $4
BLT tail_2
MOVWZ (R3), R12
CMPB R12, R5, R12
SLD $32, R12, R12 // Remove non-participating matches.
POPCNTD R12, R12
ADD R12, R18, R18
ADD $4, R3, R3
ADD $-4, R4, R4
tail_2: // Count the remaining 0 - 3 bytes.
CMP R4, $2
BLT tail_1
MOVHZ (R3), R12
CMPB R12, R5, R12
SLD $48, R12, R12 // Remove non-participating matches.
POPCNTD R12, R12
ADD R12, R18, R18
ADD $2, R3, R3
ADD $-2, R4, R4
tail_1: // Count the remaining 0 - 1 bytes.
CMP R4, $1
BLT tail_0
MOVBZ (R3), R12
CMPB R12, R5, R12
ANDCC $0x8, R12, R12
ADD R12, R18, R18
#endif
tail_0: // No remaining tail to count.
SRD $3, R18, R3 // Fixup count, it is off by 8x.
RET