linux/lib/raid6/test/Makefile
WANG Xuerui f209132104 raid6: Add LoongArch SIMD recovery implementation
Similar to the syndrome calculation, the recovery algorithms also work
on 64 bytes at a time to align with the L1 cache line size of current
and future LoongArch cores (that we care about). Which means
unrolled-by-4 LSX and unrolled-by-2 LASX code.

The assembly is originally based on the x86 SSSE3/AVX2 ports, but
register allocation has been redone to take advantage of LSX/LASX's 32
vector registers, and instruction sequence has been optimized to suit
(e.g. LoongArch can perform per-byte srl and andi on vectors, but x86
cannot).

Performance numbers measured by instrumenting the raid6test code, on a
3A5000 system clocked at 2.5GHz:

> lasx  2data: 354.987 MiB/s
> lasx  datap: 350.430 MiB/s
> lsx   2data: 340.026 MiB/s
> lsx   datap: 337.318 MiB/s
> intx1 2data: 164.280 MiB/s
> intx1 datap: 187.966 MiB/s

Because recovery algorithms are chosen solely based on priority and
availability, lasx is marked as priority 2 and lsx priority 1. At least
for the current generation of LoongArch micro-architectures, LASX should
always be faster than LSX whenever supported, and have similar power
consumption characteristics (because the only known LASX-capable uarch,
the LA464, always compute the full 256-bit result for vector ops).

Acked-by: Song Liu <song@kernel.org>
Signed-off-by: WANG Xuerui <git@xen0n.name>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
2023-09-06 22:53:55 +08:00

152 lines
4.1 KiB
Makefile

# SPDX-License-Identifier: GPL-2.0
#
# This is a simple Makefile to test some of the RAID-6 code
# from userspace.
#
pound := \#
# Adjust as desired
CC = gcc
OPTFLAGS = -O2
CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS)
LD = ld
AWK = awk -f
AR = ar
RANLIB = ranlib
OBJS = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o
ARCH := $(shell uname -m 2>/dev/null | sed -e /s/i.86/i386/)
ifeq ($(ARCH),i386)
CFLAGS += -DCONFIG_X86_32
IS_X86 = yes
endif
ifeq ($(ARCH),x86_64)
CFLAGS += -DCONFIG_X86_64
IS_X86 = yes
endif
ifeq ($(ARCH),arm)
CFLAGS += -I../../../arch/arm/include -mfpu=neon
HAS_NEON = yes
endif
ifeq ($(ARCH),aarch64)
CFLAGS += -I../../../arch/arm64/include
HAS_NEON = yes
endif
ifeq ($(findstring ppc,$(ARCH)),ppc)
CFLAGS += -I../../../arch/powerpc/include
HAS_ALTIVEC := $(shell printf '$(pound)include <altivec.h>\nvector int a;\n' |\
gcc -c -x c - >/dev/null && rm ./-.o && echo yes)
endif
ifeq ($(ARCH),loongarch64)
CFLAGS += -I../../../arch/loongarch/include -DCONFIG_LOONGARCH=1
CFLAGS += $(shell echo 'vld $$vr0, $$zero, 0' | \
gcc -c -x assembler - >/dev/null 2>&1 && \
rm ./-.o && echo -DCONFIG_CPU_HAS_LSX=1)
CFLAGS += $(shell echo 'xvld $$xr0, $$zero, 0' | \
gcc -c -x assembler - >/dev/null 2>&1 && \
rm ./-.o && echo -DCONFIG_CPU_HAS_LASX=1)
endif
ifeq ($(IS_X86),yes)
OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o
CFLAGS += -DCONFIG_X86
CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \
gcc -c -x assembler - >/dev/null 2>&1 && \
rm ./-.o && echo -DCONFIG_AS_AVX512=1)
else ifeq ($(HAS_NEON),yes)
OBJS += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1
else ifeq ($(HAS_ALTIVEC),yes)
CFLAGS += -DCONFIG_ALTIVEC
OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \
vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
else ifeq ($(ARCH),loongarch64)
OBJS += loongarch_simd.o recov_loongarch_simd.o
endif
.c.o:
$(CC) $(CFLAGS) -c -o $@ $<
%.c: ../%.c
cp -f $< $@
%.uc: ../%.uc
cp -f $< $@
all: raid6.a raid6test
raid6.a: $(OBJS)
rm -f $@
$(AR) cq $@ $^
$(RANLIB) $@
raid6test: test.c raid6.a
$(CC) $(CFLAGS) -o raid6test $^
neon1.c: neon.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=1 < neon.uc > $@
neon2.c: neon.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=2 < neon.uc > $@
neon4.c: neon.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=4 < neon.uc > $@
neon8.c: neon.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=8 < neon.uc > $@
altivec1.c: altivec.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=1 < altivec.uc > $@
altivec2.c: altivec.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=2 < altivec.uc > $@
altivec4.c: altivec.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=4 < altivec.uc > $@
altivec8.c: altivec.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=8 < altivec.uc > $@
vpermxor1.c: vpermxor.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=1 < vpermxor.uc > $@
vpermxor2.c: vpermxor.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=2 < vpermxor.uc > $@
vpermxor4.c: vpermxor.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=4 < vpermxor.uc > $@
vpermxor8.c: vpermxor.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=8 < vpermxor.uc > $@
int1.c: int.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=1 < int.uc > $@
int2.c: int.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=2 < int.uc > $@
int4.c: int.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=4 < int.uc > $@
int8.c: int.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=8 < int.uc > $@
int16.c: int.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=16 < int.uc > $@
int32.c: int.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=32 < int.uc > $@
tables.c: mktables
./mktables > tables.c
clean:
rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c vpermxor*.c neon*.c tables.c raid6test
spotless: clean
rm -f *~