From 8ff81bb24f68f747ab2f738c3d493b9c2cad52bf Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 6 Sep 2023 22:53:09 +0800 Subject: [PATCH 01/25] LoongArch: Drop unused parse_r and parse_v macros When building with CONFIG_LTO_CLANG_FULL, there are several errors due to the way that parse_r is defined with an __asm__ statement in a header: ld.lld: error: ld-temp.o :105:1: macro 'parse_r' is already defined .macro parse_r var r ^ This was an issue for arch/mips as well, which was resolved by commit 67512a8cf5a7 ("MIPS: Avoid macro redefinitions"). However, parse_r is unused in arch/loongarch after commit 83d8b38967d2 ("LoongArch: Simplify the invtlb wrappers"), so doing the same change does not make much sense now. Just remove parse_r (and parse_v, which is also unused) to resolve the redefinition error. If it needs to be brought back due to an actual use, it should be brought back with the same changes as the aforementioned arch/mips commit. Closes: https://github.com/ClangBuiltLinux/linux/issues/1924 Reviewed-by: WANG Xuerui Signed-off-by: Nathan Chancellor Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/asmmacro.h | 107 ------------------------- arch/loongarch/include/asm/loongarch.h | 43 ---------- 2 files changed, 150 deletions(-) diff --git a/arch/loongarch/include/asm/asmmacro.h b/arch/loongarch/include/asm/asmmacro.h index 79e1d53fea89..af542a8d847f 100644 --- a/arch/loongarch/include/asm/asmmacro.h +++ b/arch/loongarch/include/asm/asmmacro.h @@ -10,113 +10,6 @@ #include #include - .macro parse_v var val - \var = \val - .endm - - .macro parse_r var r - \var = -1 - .ifc \r, $r0 - \var = 0 - .endif - .ifc \r, $r1 - \var = 1 - .endif - .ifc \r, $r2 - \var = 2 - .endif - .ifc \r, $r3 - \var = 3 - .endif - .ifc \r, $r4 - \var = 4 - .endif - .ifc \r, $r5 - \var = 5 - .endif - .ifc \r, $r6 - \var = 6 - .endif - .ifc \r, $r7 - \var = 7 - .endif - .ifc \r, $r8 - \var = 8 - .endif - .ifc \r, $r9 - \var = 9 - .endif - .ifc \r, $r10 - \var = 10 - .endif - .ifc \r, $r11 - \var = 11 - .endif - .ifc \r, $r12 - \var = 12 - .endif - .ifc \r, $r13 - \var = 13 - .endif - .ifc \r, $r14 - \var = 14 - .endif - .ifc \r, $r15 - \var = 15 - .endif - .ifc \r, $r16 - \var = 16 - .endif - .ifc \r, $r17 - \var = 17 - .endif - .ifc \r, $r18 - \var = 18 - .endif - .ifc \r, $r19 - \var = 19 - .endif - .ifc \r, $r20 - \var = 20 - .endif - .ifc \r, $r21 - \var = 21 - .endif - .ifc \r, $r22 - \var = 22 - .endif - .ifc \r, $r23 - \var = 23 - .endif - .ifc \r, $r24 - \var = 24 - .endif - .ifc \r, $r25 - \var = 25 - .endif - .ifc \r, $r26 - \var = 26 - .endif - .ifc \r, $r27 - \var = 27 - .endif - .ifc \r, $r28 - \var = 28 - .endif - .ifc \r, $r29 - \var = 29 - .endif - .ifc \r, $r30 - \var = 30 - .endif - .ifc \r, $r31 - \var = 31 - .endif - .iflt \var - .error "Unable to parse register name \r" - .endif - .endm - .macro cpu_save_nonscratch thread stptr.d s0, \thread, THREAD_REG23 stptr.d s1, \thread, THREAD_REG24 diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index 10748a20a2ab..a500efe0fd92 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -12,49 +12,6 @@ #ifndef __ASSEMBLY__ #include -/* - * parse_r var, r - Helper assembler macro for parsing register names. - * - * This converts the register name in $n form provided in \r to the - * corresponding register number, which is assigned to the variable \var. 
It is
- * needed to allow explicit encoding of instructions in inline assembly where
- * registers are chosen by the compiler in $n form, allowing us to avoid using
- * fixed register numbers.
- *
- * It also allows newer instructions (not implemented by the assembler) to be
- * transparently implemented using assembler macros, instead of needing separate
- * cases depending on toolchain support.
- *
- * Simple usage example:
- * __asm__ __volatile__("parse_r addr, %0\n\t"
- *	"#invtlb op, 0, %0\n\t"
- *	".word ((0x6498000) | (addr << 10) | (0 << 5) | op)"
- *	: "=r" (status);
- */

-/* Match an individual register number and assign to \var */
-#define _IFC_REG(n)				\
-	".ifc	\\r, $r" #n "\n\t"		\
-	"\\var	= " #n "\n\t"			\
-	".endif\n\t"
-
-__asm__(".macro	parse_r var r\n\t"
-	"\\var	= -1\n\t"
-	_IFC_REG(0)  _IFC_REG(1)  _IFC_REG(2)  _IFC_REG(3)
-	_IFC_REG(4)  _IFC_REG(5)  _IFC_REG(6)  _IFC_REG(7)
-	_IFC_REG(8)  _IFC_REG(9)  _IFC_REG(10) _IFC_REG(11)
-	_IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15)
-	_IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19)
-	_IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23)
-	_IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27)
-	_IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31)
-	".iflt	\\var\n\t"
-	".error	\"Unable to parse register name \\r\"\n\t"
-	".endif\n\t"
-	".endm");
-
-#undef _IFC_REG
-
 /* CPUCFG */
 #define read_cpucfg(reg) __cpucfg(reg)

From 303be4b33562a5b689261ced1616bf16ad49efa7 Mon Sep 17 00:00:00 2001
From: Hongchen Zhang
Date: Wed, 6 Sep 2023 22:53:09 +0800
Subject: [PATCH 02/25] LoongArch: mm: Add p?d_leaf() definitions

When running the LTP test suite, test case ksm06 caused a panic in the
call chain:

  break_ksm_pmd_entry
    -> pmd_leaf (huge page table, but returns false)
    -> pte_present (panic)

The reason is that pmd_leaf() is not defined, so, like commit
501b81046701 ("mips: mm: add p?d_leaf() definitions"), add p?d_leaf()
definitions for LoongArch.

Fixes: 09cfefb7fa70 ("LoongArch: Add memory management")
Cc: stable@vger.kernel.org
Acked-by: David Hildenbrand
Signed-off-by: Hongchen Zhang
Signed-off-by: Huacai Chen
---
 arch/loongarch/include/asm/pgtable.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h
index 38afeb7dd58b..0ac6afa4a825 100644
--- a/arch/loongarch/include/asm/pgtable.h
+++ b/arch/loongarch/include/asm/pgtable.h
@@ -593,6 +593,9 @@ static inline long pmd_protnone(pmd_t pmd)
 }
 #endif /* CONFIG_NUMA_BALANCING */

+#define pmd_leaf(pmd)	((pmd_val(pmd) & _PAGE_HUGE) != 0)
+#define pud_leaf(pud)	((pud_val(pud) & _PAGE_HUGE) != 0)
+
 /*
  * We provide our own get_unmapped area to cope with the virtual aliasing
  * constraints placed on us by the cache architecture.

From ad3ff105611b9b06e16ae57e97b48916ff93dd46 Mon Sep 17 00:00:00 2001
From: Huacai Chen
Date: Wed, 6 Sep 2023 22:53:09 +0800
Subject: [PATCH 03/25] LoongArch: Remove shm_align_mask and use SHMLBA instead

Both shm_align_mask and SHMLBA exist to avoid cache aliasing. But they
are inconsistent: shm_align_mask is (PAGE_SIZE - 1) while SHMLBA is
SZ_64K, and PAGE_SIZE is not always equal to SZ_64K. This may cause
problems when shmat() is called twice on the same segment. Fix this by
removing shm_align_mask and using SHMLBA (strictly, SHMLBA - 1) instead.
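To make the inconsistency concrete, here is a small userspace sketch
(with assumed constants, not part of this patch): on a kernel built with
16KB pages, an address can satisfy the old page-sized mask while
violating the SHMLBA one, so two shmat() attaches may end up with
different cache colours for the same segment:

    /* sketch: the two alignment masks disagree whenever PAGE_SIZE < SHMLBA */
    #include <stdio.h>

    #define PAGE_SIZE 0x4000UL  /* assumed 16KB pages */
    #define SHMLBA    0x10000UL /* 64KB, as on LoongArch */

    int main(void)
    {
            unsigned long addr = 0x4000; /* page-aligned only */

            printf("page mask ok:   %d\n", !(addr & (PAGE_SIZE - 1)));
            printf("SHMLBA mask ok: %d\n", !(addr & (SHMLBA - 1)));
            return 0; /* prints 1 then 0: same address, different verdicts */
    }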
Reported-by: Jiantao Shan
Signed-off-by: Huacai Chen
---
 arch/loongarch/mm/cache.c |  1 -
 arch/loongarch/mm/mmap.c  | 13 ++++++-------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/arch/loongarch/mm/cache.c b/arch/loongarch/mm/cache.c
index 72685a48eaf0..6be04d36ca07 100644
--- a/arch/loongarch/mm/cache.c
+++ b/arch/loongarch/mm/cache.c
@@ -156,7 +156,6 @@ void cpu_cache_init(void)

 	current_cpu_data.cache_leaves_present = leaf;
 	current_cpu_data.options |= LOONGARCH_CPU_PREFETCH;
-	shm_align_mask = PAGE_SIZE - 1;
 }

 static const pgprot_t protection_map[16] = {
diff --git a/arch/loongarch/mm/mmap.c b/arch/loongarch/mm/mmap.c
index fbe1a4856fc4..a9630a81b38a 100644
--- a/arch/loongarch/mm/mmap.c
+++ b/arch/loongarch/mm/mmap.c
@@ -8,12 +8,11 @@
 #include
 #include

-unsigned long shm_align_mask = PAGE_SIZE - 1;	/* Sane caches */
-EXPORT_SYMBOL(shm_align_mask);
+#define SHM_ALIGN_MASK	(SHMLBA - 1)

-#define COLOUR_ALIGN(addr, pgoff)				\
-	((((addr) + shm_align_mask) & ~shm_align_mask) +	\
-	 (((pgoff) << PAGE_SHIFT) & shm_align_mask))
+#define COLOUR_ALIGN(addr, pgoff)			\
+	((((addr) + SHM_ALIGN_MASK) & ~SHM_ALIGN_MASK)	\
+	 + (((pgoff) << PAGE_SHIFT) & SHM_ALIGN_MASK))

 enum mmap_allocation_direction {UP, DOWN};

@@ -40,7 +39,7 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
 		 * cache aliasing constraints.
 		 */
 		if ((flags & MAP_SHARED) &&
-		    ((addr - (pgoff << PAGE_SHIFT)) & shm_align_mask))
+		    ((addr - (pgoff << PAGE_SHIFT)) & SHM_ALIGN_MASK))
 			return -EINVAL;
 		return addr;
 	}
@@ -63,7 +62,7 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
 	}

 	info.length = len;
-	info.align_mask = do_color_align ? (PAGE_MASK & shm_align_mask) : 0;
+	info.align_mask = do_color_align ? (PAGE_MASK & SHM_ALIGN_MASK) : 0;
 	info.align_offset = pgoff << PAGE_SHIFT;

 	if (dir == DOWN) {

From f33efa905ce4839d9d1f20b559db9c2e8a39e059 Mon Sep 17 00:00:00 2001
From: Bibo Mao
Date: Wed, 6 Sep 2023 22:53:09 +0800
Subject: [PATCH 04/25] LoongArch: Code improvements in function pcpu_populate_pte()

Make some code improvements in function pcpu_populate_pte():

1. Add memory allocation failure handling;
2. Replace pgd_populate() with p4d_populate(), which will be useful if
   there are four-level page tables (see the sketch below).
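As a sketch of point 2 (illustration only, mirroring the hunk below):
with four-level page tables the walk is pgd -> p4d -> pud -> pmd -> pte,
so a newly allocated pud table hangs off the p4d entry, and
p4d_populate() is the correct way to install it; pgd_populate() only
happens to work while p4d is folded into pgd:

	pgd_t *pgd = pgd_offset_k(addr);    /* top level */
	p4d_t *p4d = p4d_offset(pgd, addr); /* same as pgd when folded */

	if (p4d_none(*p4d)) {
		pud_t *pud = memblock_alloc(PAGE_SIZE, PAGE_SIZE);

		if (!pud)
			panic("%s: Failed to allocate memory\n", __func__);
		p4d_populate(&init_mm, p4d, pud); /* not pgd_populate() */
	}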
Signed-off-by: Bibo Mao
Signed-off-by: Huacai Chen
---
 arch/loongarch/kernel/numa.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c
index 708665895b47..4844ad2d7712 100644
--- a/arch/loongarch/kernel/numa.c
+++ b/arch/loongarch/kernel/numa.c
@@ -73,32 +73,34 @@ void __init pcpu_populate_pte(unsigned long addr)
 	pmd_t *pmd;

 	if (p4d_none(*p4d)) {
-		pud_t *new;
-
-		new = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-		pgd_populate(&init_mm, pgd, new);
+		pud = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+		if (!pud)
+			panic("%s: Failed to allocate memory\n", __func__);
+		p4d_populate(&init_mm, p4d, pud);
 #ifndef __PAGETABLE_PUD_FOLDED
-		pud_init(new);
+		pud_init(pud);
 #endif
 	}

 	pud = pud_offset(p4d, addr);
 	if (pud_none(*pud)) {
-		pmd_t *new;
-
-		new = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-		pud_populate(&init_mm, pud, new);
+		pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+		if (!pmd)
+			panic("%s: Failed to allocate memory\n", __func__);
+		pud_populate(&init_mm, pud, pmd);
 #ifndef __PAGETABLE_PMD_FOLDED
-		pmd_init(new);
+		pmd_init(pmd);
 #endif
 	}

 	pmd = pmd_offset(pud, addr);
 	if (!pmd_present(*pmd)) {
-		pte_t *new;
+		pte_t *pte;

-		new = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-		pmd_populate_kernel(&init_mm, pmd, new);
+		pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+		if (!pte)
+			panic("%s: Failed to allocate memory\n", __func__);
+		pmd_populate_kernel(&init_mm, pmd, pte);
 	}
 }

From 2bb20d2926a8ea991386315aa8017990ef7beb6a Mon Sep 17 00:00:00 2001
From: Bibo Mao
Date: Wed, 6 Sep 2023 22:53:09 +0800
Subject: [PATCH 05/25] LoongArch: mm: Introduce unified function populate_kernel_pte()

The functions pcpu_populate_pte() and fixmap_pte() are similar: they
both populate one page from the kernel address space. There is also
confusion between pgd and p4d in fixmap_pte(); for example, pgd_none()
there always returns zero. This patch introduces a unified function
populate_kernel_pte() and then replaces pcpu_populate_pte() and
fixmap_pte() with it.
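After this change both call sites reduce to the single helper; a minimal
sketch of the resulting usage (mirroring the hunks below):

	/* percpu first-chunk mapping: delegate entirely */
	void __init pcpu_populate_pte(unsigned long addr)
	{
		populate_kernel_pte(addr);
	}

	/* fixmap: additionally uses the returned pte pointer */
	ptep = populate_kernel_pte(addr);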
Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/pgalloc.h | 1 + arch/loongarch/kernel/numa.c | 36 +---------------------- arch/loongarch/mm/init.c | 43 ++++++++++++++-------------- 3 files changed, 23 insertions(+), 57 deletions(-) diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h index af1d1e4a6965..ca17b573dba6 100644 --- a/arch/loongarch/include/asm/pgalloc.h +++ b/arch/loongarch/include/asm/pgalloc.h @@ -91,4 +91,5 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) #endif /* __PAGETABLE_PUD_FOLDED */ +extern pte_t * __init populate_kernel_pte(unsigned long addr); #endif /* _ASM_PGALLOC_H */ diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c index 4844ad2d7712..cb00804826f7 100644 --- a/arch/loongarch/kernel/numa.c +++ b/arch/loongarch/kernel/numa.c @@ -67,41 +67,7 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) void __init pcpu_populate_pte(unsigned long addr) { - pgd_t *pgd = pgd_offset_k(addr); - p4d_t *p4d = p4d_offset(pgd, addr); - pud_t *pud; - pmd_t *pmd; - - if (p4d_none(*p4d)) { - pud = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pud) - panic("%s: Failed to allocate memory\n", __func__); - p4d_populate(&init_mm, p4d, pud); -#ifndef __PAGETABLE_PUD_FOLDED - pud_init(pud); -#endif - } - - pud = pud_offset(p4d, addr); - if (pud_none(*pud)) { - pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pmd) - panic("%s: Failed to allocate memory\n", __func__); - pud_populate(&init_mm, pud, pmd); -#ifndef __PAGETABLE_PMD_FOLDED - pmd_init(pmd); -#endif - } - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) { - pte_t *pte; - - pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pte) - panic("%s: Failed to allocate memory\n", __func__); - pmd_populate_kernel(&init_mm, pmd, pte); - } + populate_kernel_pte(addr); } void __init setup_per_cpu_areas(void) diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index 3b7d8129570b..0f1dee285da2 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -191,43 +191,42 @@ void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *al #endif #endif -static pte_t *fixmap_pte(unsigned long addr) +pte_t * __init populate_kernel_pte(unsigned long addr) { - pgd_t *pgd; - p4d_t *p4d; + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud; pmd_t *pmd; - pgd = pgd_offset_k(addr); - p4d = p4d_offset(pgd, addr); - - if (pgd_none(*pgd)) { - pud_t *new __maybe_unused; - - new = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); - pgd_populate(&init_mm, pgd, new); + if (p4d_none(*p4d)) { + pud = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + if (!pud) + panic("%s: Failed to allocate memory\n", __func__); + p4d_populate(&init_mm, p4d, pud); #ifndef __PAGETABLE_PUD_FOLDED - pud_init(new); + pud_init(pud); #endif } pud = pud_offset(p4d, addr); if (pud_none(*pud)) { - pmd_t *new __maybe_unused; - - new = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); - pud_populate(&init_mm, pud, new); + pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + if (!pmd) + panic("%s: Failed to allocate memory\n", __func__); + pud_populate(&init_mm, pud, pmd); #ifndef __PAGETABLE_PMD_FOLDED - pmd_init(new); + pmd_init(pmd); #endif } pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) { - pte_t *new __maybe_unused; + if (!pmd_present(*pmd)) { + pte_t *pte; - new = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); - pmd_populate_kernel(&init_mm, pmd, new); + pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE); 
+		if (!pte)
+			panic("%s: Failed to allocate memory\n", __func__);
+		pmd_populate_kernel(&init_mm, pmd, pte);
 	}

 	return pte_offset_kernel(pmd, addr);
@@ -241,7 +240,7 @@ void __init __set_fixmap(enum fixed_addresses idx,

 	BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses);

-	ptep = fixmap_pte(addr);
+	ptep = populate_kernel_pte(addr);
 	if (!pte_none(*ptep)) {
 		pte_ERROR(*ptep);
 		return;

From 0921af6ccfb37dc2d6aefcf744333c14e7ca739d Mon Sep 17 00:00:00 2001
From: Bibo Mao
Date: Wed, 6 Sep 2023 22:53:10 +0800
Subject: [PATCH 06/25] LoongArch: Use static defined zero page rather than allocated

On LoongArch systems only one page is needed for the zero page (there
are no cache synonyms), and there is no COLOR_ZERO_PAGE, so
zero_page_mask is useless and the macro __HAVE_COLOR_ZERO_PAGE is not
necessary. Like other popular architectures, it is simpler to define the
zero page in the kernel BSS segment than to allocate it dynamically.

Signed-off-by: Bibo Mao
Signed-off-by: Huacai Chen
---
 arch/loongarch/include/asm/mmzone.h  |  2 --
 arch/loongarch/include/asm/pgtable.h |  7 ++-----
 arch/loongarch/kernel/numa.c         |  1 -
 arch/loongarch/mm/init.c             | 28 +---------------------------
 4 files changed, 3 insertions(+), 35 deletions(-)

diff --git a/arch/loongarch/include/asm/mmzone.h b/arch/loongarch/include/asm/mmzone.h
index fe67d0b4b33d..2b9a90727e19 100644
--- a/arch/loongarch/include/asm/mmzone.h
+++ b/arch/loongarch/include/asm/mmzone.h
@@ -13,6 +13,4 @@ extern struct pglist_data *node_data[];

 #define NODE_DATA(nid)	(node_data[(nid)])

-extern void setup_zero_pages(void);
-
 #endif /* _ASM_MMZONE_H_ */
diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h
index 0ac6afa4a825..7699af049443 100644
--- a/arch/loongarch/include/asm/pgtable.h
+++ b/arch/loongarch/include/asm/pgtable.h
@@ -70,12 +70,9 @@ struct vm_area_struct;
  * for zero-mapped memory areas etc..
  */

-extern unsigned long empty_zero_page;
-extern unsigned long zero_page_mask;
+extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];

-#define ZERO_PAGE(vaddr) \
-	(virt_to_page((void *)(empty_zero_page + (((unsigned long)(vaddr)) & zero_page_mask))))
-#define __HAVE_COLOR_ZERO_PAGE
+#define ZERO_PAGE(vaddr)	virt_to_page(empty_zero_page)

 /*
  * TLB refill handlers may also map the vmalloc area into xkvrange.
diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c
index cb00804826f7..c7d33c489e04 100644
--- a/arch/loongarch/kernel/numa.c
+++ b/arch/loongarch/kernel/numa.c
@@ -438,7 +438,6 @@ void __init mem_init(void)
 {
 	high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT);
 	memblock_free_all();
-	setup_zero_pages();	/* This comes from node 0 */
 }

 int pcibus_to_node(struct pci_bus *bus)
diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c
index 0f1dee285da2..f3fe8c06ba4d 100644
--- a/arch/loongarch/mm/init.c
+++ b/arch/loongarch/mm/init.c
@@ -35,33 +35,8 @@
 #include
 #include

-/*
- * We have up to 8 empty zeroed pages so we can map one of the right colour
- * when needed. Since page is never written to after the initialization we
- * don't have to care about aliases on other CPUs.
- */
-unsigned long empty_zero_page, zero_page_mask;
+unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
 EXPORT_SYMBOL(empty_zero_page);
-EXPORT_SYMBOL(zero_page_mask);
-
-void setup_zero_pages(void)
-{
-	unsigned int order, i;
-	struct page *page;
-
-	order = 0;
-
-	empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
-	if (!empty_zero_page)
-		panic("Oh boy, that early out of memory?");
-
-	page = virt_to_page((void *)empty_zero_page);
-	split_page(page, order);
-	for (i = 0; i < (1 << order); i++, page++)
-		mark_page_reserved(page);
-
-	zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK;
-}

 void copy_user_highpage(struct page *to, struct page *from,
 	unsigned long vaddr, struct vm_area_struct *vma)
@@ -106,7 +81,6 @@ void __init mem_init(void)
 	high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);
 	memblock_free_all();
-	setup_zero_pages();	/* Setup zeroed pages. */
 }

 #endif /* !CONFIG_NUMA */

From 937f65935950a0346292120acc5d98033f90e61c Mon Sep 17 00:00:00 2001
From: Weihao Li
Date: Wed, 6 Sep 2023 22:53:10 +0800
Subject: [PATCH 07/25] LoongArch: Adjust {copy, clear}_user exception handler behavior

The {copy, clear}_user functions should return the number of bytes that
could not be {copied, cleared}. So, try to {copy, clear} byte by byte
when an ld.{d,w,h} or st.{d,w,h} instruction traps into an exception.

Reviewed-by: WANG Rui
Signed-off-by: Weihao Li
Signed-off-by: Huacai Chen
---
 arch/loongarch/lib/clear_user.S |  87 ++++++++---------
 arch/loongarch/lib/copy_user.S  | 161 ++++++++++++++++----------------
 2 files changed, 127 insertions(+), 121 deletions(-)

diff --git a/arch/loongarch/lib/clear_user.S b/arch/loongarch/lib/clear_user.S
index 0790eadce166..be741544e62b 100644
--- a/arch/loongarch/lib/clear_user.S
+++ b/arch/loongarch/lib/clear_user.S
@@ -11,19 +11,6 @@
 #include
 #include

-.irp to, 0, 1, 2, 3, 4, 5, 6, 7
-.L_fixup_handle_\to\():
-	sub.d	a0, a2, a0
-	addi.d	a0, a0, (\to) * (-8)
-	jr	ra
-.endr
-
-.irp to, 0, 2, 4
-.L_fixup_handle_s\to\():
-	addi.d	a0, a1, -\to
-	jr	ra
-.endr
-
 SYM_FUNC_START(__clear_user)
 	/*
 	 * Some CPUs support hardware unaligned access
@@ -51,7 +38,7 @@ SYM_FUNC_START(__clear_user_generic)
 2:	move	a0, a1
 	jr	ra

-	_asm_extable 1b, .L_fixup_handle_s0
+	_asm_extable 1b, 2b
 SYM_FUNC_END(__clear_user_generic)

 /*
@@ -173,33 +160,47 @@ SYM_FUNC_START(__clear_user_fast)
 	jr	ra

 	/* fixup and ex_table */
-	_asm_extable 0b, .L_fixup_handle_0
-	_asm_extable 1b, .L_fixup_handle_0
-	_asm_extable 2b, .L_fixup_handle_1
-	_asm_extable 3b, .L_fixup_handle_2
-	_asm_extable 4b, .L_fixup_handle_3
-	_asm_extable 5b, .L_fixup_handle_4
-	_asm_extable 6b, .L_fixup_handle_5
-	_asm_extable 7b, .L_fixup_handle_6
-	_asm_extable 8b, .L_fixup_handle_7
-	_asm_extable 9b, .L_fixup_handle_0
-	_asm_extable 10b, .L_fixup_handle_1
-	_asm_extable 11b, .L_fixup_handle_2
-	_asm_extable 12b, .L_fixup_handle_3
-	_asm_extable 13b, .L_fixup_handle_0
-	_asm_extable 14b, .L_fixup_handle_1
-	_asm_extable 15b, .L_fixup_handle_0
-	_asm_extable 16b, .L_fixup_handle_0
-	_asm_extable 17b, .L_fixup_handle_s0
-	_asm_extable 18b, .L_fixup_handle_s0
-	_asm_extable 19b, .L_fixup_handle_s0
-	_asm_extable 20b, .L_fixup_handle_s2
-	_asm_extable 21b, .L_fixup_handle_s0
-	_asm_extable 22b, .L_fixup_handle_s0
-	_asm_extable 23b, .L_fixup_handle_s4
-	_asm_extable 24b, .L_fixup_handle_s0
-	_asm_extable 25b, .L_fixup_handle_s4
-	_asm_extable 26b, .L_fixup_handle_s0
-	_asm_extable 27b, .L_fixup_handle_s4
-	_asm_extable 28b, .L_fixup_handle_s0
+.Llarge_fixup:
+	sub.d	a1, a2, a0
+
+.Lsmall_fixup: +29: st.b zero, a0, 0 + addi.d a0, a0, 1 + addi.d a1, a1, -1 + bgt a1, zero, 29b + +.Lexit: + move a0, a1 + jr ra + + _asm_extable 0b, .Lsmall_fixup + _asm_extable 1b, .Llarge_fixup + _asm_extable 2b, .Llarge_fixup + _asm_extable 3b, .Llarge_fixup + _asm_extable 4b, .Llarge_fixup + _asm_extable 5b, .Llarge_fixup + _asm_extable 6b, .Llarge_fixup + _asm_extable 7b, .Llarge_fixup + _asm_extable 8b, .Llarge_fixup + _asm_extable 9b, .Llarge_fixup + _asm_extable 10b, .Llarge_fixup + _asm_extable 11b, .Llarge_fixup + _asm_extable 12b, .Llarge_fixup + _asm_extable 13b, .Llarge_fixup + _asm_extable 14b, .Llarge_fixup + _asm_extable 15b, .Llarge_fixup + _asm_extable 16b, .Llarge_fixup + _asm_extable 17b, .Lexit + _asm_extable 18b, .Lsmall_fixup + _asm_extable 19b, .Lsmall_fixup + _asm_extable 20b, .Lsmall_fixup + _asm_extable 21b, .Lsmall_fixup + _asm_extable 22b, .Lsmall_fixup + _asm_extable 23b, .Lsmall_fixup + _asm_extable 24b, .Lsmall_fixup + _asm_extable 25b, .Lsmall_fixup + _asm_extable 26b, .Lsmall_fixup + _asm_extable 27b, .Lsmall_fixup + _asm_extable 28b, .Lsmall_fixup + _asm_extable 29b, .Lexit SYM_FUNC_END(__clear_user_fast) diff --git a/arch/loongarch/lib/copy_user.S b/arch/loongarch/lib/copy_user.S index bfe3d2793d00..feec3d362803 100644 --- a/arch/loongarch/lib/copy_user.S +++ b/arch/loongarch/lib/copy_user.S @@ -11,19 +11,6 @@ #include #include -.irp to, 0, 1, 2, 3, 4, 5, 6, 7 -.L_fixup_handle_\to\(): - sub.d a0, a2, a0 - addi.d a0, a0, (\to) * (-8) - jr ra -.endr - -.irp to, 0, 2, 4 -.L_fixup_handle_s\to\(): - addi.d a0, a2, -\to - jr ra -.endr - SYM_FUNC_START(__copy_user) /* * Some CPUs support hardware unaligned access @@ -54,8 +41,8 @@ SYM_FUNC_START(__copy_user_generic) 3: move a0, a2 jr ra - _asm_extable 1b, .L_fixup_handle_s0 - _asm_extable 2b, .L_fixup_handle_s0 + _asm_extable 1b, 3b + _asm_extable 2b, 3b SYM_FUNC_END(__copy_user_generic) /* @@ -69,10 +56,10 @@ SYM_FUNC_START(__copy_user_fast) sltui t0, a2, 9 bnez t0, .Lsmall - add.d a3, a1, a2 - add.d a2, a0, a2 0: ld.d t0, a1, 0 1: st.d t0, a0, 0 + add.d a3, a1, a2 + add.d a2, a0, a2 /* align up destination address */ andi t1, a0, 7 @@ -94,7 +81,6 @@ SYM_FUNC_START(__copy_user_fast) 7: ld.d t5, a1, 40 8: ld.d t6, a1, 48 9: ld.d t7, a1, 56 - addi.d a1, a1, 64 10: st.d t0, a0, 0 11: st.d t1, a0, 8 12: st.d t2, a0, 16 @@ -103,6 +89,7 @@ SYM_FUNC_START(__copy_user_fast) 15: st.d t5, a0, 40 16: st.d t6, a0, 48 17: st.d t7, a0, 56 + addi.d a1, a1, 64 addi.d a0, a0, 64 bltu a1, a4, .Lloop64 @@ -114,11 +101,11 @@ SYM_FUNC_START(__copy_user_fast) 19: ld.d t1, a1, 8 20: ld.d t2, a1, 16 21: ld.d t3, a1, 24 - addi.d a1, a1, 32 22: st.d t0, a0, 0 23: st.d t1, a0, 8 24: st.d t2, a0, 16 25: st.d t3, a0, 24 + addi.d a1, a1, 32 addi.d a0, a0, 32 .Llt32: @@ -126,9 +113,9 @@ SYM_FUNC_START(__copy_user_fast) bgeu a1, a4, .Llt16 26: ld.d t0, a1, 0 27: ld.d t1, a1, 8 - addi.d a1, a1, 16 28: st.d t0, a0, 0 29: st.d t1, a0, 8 + addi.d a1, a1, 16 addi.d a0, a0, 16 .Llt16: @@ -136,6 +123,7 @@ SYM_FUNC_START(__copy_user_fast) bgeu a1, a4, .Llt8 30: ld.d t0, a1, 0 31: st.d t0, a0, 0 + addi.d a1, a1, 8 addi.d a0, a0, 8 .Llt8: @@ -214,62 +202,79 @@ SYM_FUNC_START(__copy_user_fast) jr ra /* fixup and ex_table */ - _asm_extable 0b, .L_fixup_handle_0 - _asm_extable 1b, .L_fixup_handle_0 - _asm_extable 2b, .L_fixup_handle_0 - _asm_extable 3b, .L_fixup_handle_0 - _asm_extable 4b, .L_fixup_handle_0 - _asm_extable 5b, .L_fixup_handle_0 - _asm_extable 6b, .L_fixup_handle_0 - _asm_extable 7b, .L_fixup_handle_0 - _asm_extable 8b, .L_fixup_handle_0 
- _asm_extable 9b, .L_fixup_handle_0 - _asm_extable 10b, .L_fixup_handle_0 - _asm_extable 11b, .L_fixup_handle_1 - _asm_extable 12b, .L_fixup_handle_2 - _asm_extable 13b, .L_fixup_handle_3 - _asm_extable 14b, .L_fixup_handle_4 - _asm_extable 15b, .L_fixup_handle_5 - _asm_extable 16b, .L_fixup_handle_6 - _asm_extable 17b, .L_fixup_handle_7 - _asm_extable 18b, .L_fixup_handle_0 - _asm_extable 19b, .L_fixup_handle_0 - _asm_extable 20b, .L_fixup_handle_0 - _asm_extable 21b, .L_fixup_handle_0 - _asm_extable 22b, .L_fixup_handle_0 - _asm_extable 23b, .L_fixup_handle_1 - _asm_extable 24b, .L_fixup_handle_2 - _asm_extable 25b, .L_fixup_handle_3 - _asm_extable 26b, .L_fixup_handle_0 - _asm_extable 27b, .L_fixup_handle_0 - _asm_extable 28b, .L_fixup_handle_0 - _asm_extable 29b, .L_fixup_handle_1 - _asm_extable 30b, .L_fixup_handle_0 - _asm_extable 31b, .L_fixup_handle_0 - _asm_extable 32b, .L_fixup_handle_0 - _asm_extable 33b, .L_fixup_handle_0 - _asm_extable 34b, .L_fixup_handle_s0 - _asm_extable 35b, .L_fixup_handle_s0 - _asm_extable 36b, .L_fixup_handle_s0 - _asm_extable 37b, .L_fixup_handle_s0 - _asm_extable 38b, .L_fixup_handle_s0 - _asm_extable 39b, .L_fixup_handle_s0 - _asm_extable 40b, .L_fixup_handle_s0 - _asm_extable 41b, .L_fixup_handle_s2 - _asm_extable 42b, .L_fixup_handle_s0 - _asm_extable 43b, .L_fixup_handle_s0 - _asm_extable 44b, .L_fixup_handle_s0 - _asm_extable 45b, .L_fixup_handle_s0 - _asm_extable 46b, .L_fixup_handle_s0 - _asm_extable 47b, .L_fixup_handle_s4 - _asm_extable 48b, .L_fixup_handle_s0 - _asm_extable 49b, .L_fixup_handle_s0 - _asm_extable 50b, .L_fixup_handle_s0 - _asm_extable 51b, .L_fixup_handle_s4 - _asm_extable 52b, .L_fixup_handle_s0 - _asm_extable 53b, .L_fixup_handle_s0 - _asm_extable 54b, .L_fixup_handle_s0 - _asm_extable 55b, .L_fixup_handle_s4 - _asm_extable 56b, .L_fixup_handle_s0 - _asm_extable 57b, .L_fixup_handle_s0 +.Llarge_fixup: + sub.d a2, a2, a0 + +.Lsmall_fixup: +58: ld.b t0, a1, 0 +59: st.b t0, a0, 0 + addi.d a0, a0, 1 + addi.d a1, a1, 1 + addi.d a2, a2, -1 + bgt a2, zero, 58b + +.Lexit: + move a0, a2 + jr ra + + _asm_extable 0b, .Lsmall_fixup + _asm_extable 1b, .Lsmall_fixup + _asm_extable 2b, .Llarge_fixup + _asm_extable 3b, .Llarge_fixup + _asm_extable 4b, .Llarge_fixup + _asm_extable 5b, .Llarge_fixup + _asm_extable 6b, .Llarge_fixup + _asm_extable 7b, .Llarge_fixup + _asm_extable 8b, .Llarge_fixup + _asm_extable 9b, .Llarge_fixup + _asm_extable 10b, .Llarge_fixup + _asm_extable 11b, .Llarge_fixup + _asm_extable 12b, .Llarge_fixup + _asm_extable 13b, .Llarge_fixup + _asm_extable 14b, .Llarge_fixup + _asm_extable 15b, .Llarge_fixup + _asm_extable 16b, .Llarge_fixup + _asm_extable 17b, .Llarge_fixup + _asm_extable 18b, .Llarge_fixup + _asm_extable 19b, .Llarge_fixup + _asm_extable 20b, .Llarge_fixup + _asm_extable 21b, .Llarge_fixup + _asm_extable 22b, .Llarge_fixup + _asm_extable 23b, .Llarge_fixup + _asm_extable 24b, .Llarge_fixup + _asm_extable 25b, .Llarge_fixup + _asm_extable 26b, .Llarge_fixup + _asm_extable 27b, .Llarge_fixup + _asm_extable 28b, .Llarge_fixup + _asm_extable 29b, .Llarge_fixup + _asm_extable 30b, .Llarge_fixup + _asm_extable 31b, .Llarge_fixup + _asm_extable 32b, .Llarge_fixup + _asm_extable 33b, .Llarge_fixup + _asm_extable 34b, .Lexit + _asm_extable 35b, .Lexit + _asm_extable 36b, .Lsmall_fixup + _asm_extable 37b, .Lsmall_fixup + _asm_extable 38b, .Lsmall_fixup + _asm_extable 39b, .Lsmall_fixup + _asm_extable 40b, .Lsmall_fixup + _asm_extable 41b, .Lsmall_fixup + _asm_extable 42b, .Lsmall_fixup + _asm_extable 43b, 
.Lsmall_fixup
+	_asm_extable 44b, .Lsmall_fixup
+	_asm_extable 45b, .Lsmall_fixup
+	_asm_extable 46b, .Lsmall_fixup
+	_asm_extable 47b, .Lsmall_fixup
+	_asm_extable 48b, .Lsmall_fixup
+	_asm_extable 49b, .Lsmall_fixup
+	_asm_extable 50b, .Lsmall_fixup
+	_asm_extable 51b, .Lsmall_fixup
+	_asm_extable 52b, .Lsmall_fixup
+	_asm_extable 53b, .Lsmall_fixup
+	_asm_extable 54b, .Lsmall_fixup
+	_asm_extable 55b, .Lsmall_fixup
+	_asm_extable 56b, .Lsmall_fixup
+	_asm_extable 57b, .Lsmall_fixup
+	_asm_extable 58b, .Lexit
+	_asm_extable 59b, .Lexit
 SYM_FUNC_END(__copy_user_fast)

From 8f58c571bf3095278ebda49ce95df5ead27ebb42 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang
Date: Wed, 6 Sep 2023 22:53:10 +0800
Subject: [PATCH 08/25] LoongArch: Define symbol 'fault' as a local label in fpu.S

The initial aim is to silence the following objtool warnings:

  arch/loongarch/kernel/fpu.o: warning: objtool: _save_fp_context() falls through to next function fault()
  arch/loongarch/kernel/fpu.o: warning: objtool: _restore_fp_context() falls through to next function fault()
  arch/loongarch/kernel/fpu.o: warning: objtool: _save_lsx_context() falls through to next function fault()
  arch/loongarch/kernel/fpu.o: warning: objtool: _restore_lsx_context() falls through to next function fault()
  arch/loongarch/kernel/fpu.o: warning: objtool: _save_lasx_context() falls through to next function fault()
  arch/loongarch/kernel/fpu.o: warning: objtool: _restore_lasx_context() falls through to next function fault()

Currently, SYM_FUNC_START()/SYM_FUNC_END() defines the symbol 'fault'
as SYM_T_FUNC, which is STT_FUNC, so the objtool warnings are generated
through the following code:

tools/objtool/include/objtool/check.h:

static inline struct symbol *insn_func(struct instruction *insn)
{
	struct symbol *sym = insn->sym;

	if (sym && sym->type != STT_FUNC)
		sym = NULL;

	return sym;
}

tools/objtool/check.c:

static int validate_branch(struct objtool_file *file, struct symbol *func,
			   struct instruction *insn, struct insn_state state)
{
	...
	if (func && insn_func(insn) && func != insn_func(insn)->pfunc) {
	...
		WARN("%s() falls through to next function %s()",
		     func->name, insn_func(insn)->name);
		return 1;
	}
	...
}

We can see that the fixup can be a local label in the following code:

arch/loongarch/include/asm/asm-extable.h:

	.pushsection	__ex_table, "a";		\
	.balign		4;				\
	.long		((insn) - .);			\
	.long		((fixup) - .);			\
	.short		(type);				\
	.short		(data);				\
	.popsection;

	.macro		_asm_extable, insn, fixup
	__ASM_EXTABLE_RAW(\insn, \fixup, EX_TYPE_FIXUP, 0)
	.endm

Like arch/loongarch/lib/*.S, just define the symbol 'fault' as a local
label in fpu.S.
Before: $ readelf -s arch/loongarch/kernel/fpu.o | awk -F: /fault/'{print $2}' 000000000000053c 8 FUNC GLOBAL DEFAULT 1 fault After: $ readelf -s arch/loongarch/kernel/fpu.o | awk -F: /fault/'{print $2}' 000000000000053c 0 NOTYPE LOCAL DEFAULT 1 .L_fpu_fault Co-developed-by: Youling Tang Signed-off-by: Youling Tang Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/fpu.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S index 501094a09f5d..80dccf8e39a7 100644 --- a/arch/loongarch/kernel/fpu.S +++ b/arch/loongarch/kernel/fpu.S @@ -22,7 +22,7 @@ .macro EX insn, reg, src, offs .ex\@: \insn \reg, \src, \offs - _asm_extable .ex\@, fault + _asm_extable .ex\@, .L_fpu_fault .endm .macro sc_save_fp base @@ -514,7 +514,6 @@ SYM_FUNC_START(_restore_lasx_context) jr ra SYM_FUNC_END(_restore_lasx_context) -SYM_FUNC_START(fault) +.L_fpu_fault: li.w a0, -EFAULT # failure jr ra -SYM_FUNC_END(fault) From 2478e4b7593a2a55073a4a6bf23dc885c19befd8 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 6 Sep 2023 22:53:55 +0800 Subject: [PATCH 09/25] LoongArch: Allow usage of LSX/LASX in the kernel Allow usage of LSX/LASX in the kernel by extending kernel_fpu_begin() and kernel_fpu_end(). Reviewed-by: WANG Xuerui Signed-off-by: Huacai Chen --- arch/loongarch/kernel/kfpu.c | 55 +++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/kernel/kfpu.c b/arch/loongarch/kernel/kfpu.c index 5c46ae8c6cac..ec5b28e570c9 100644 --- a/arch/loongarch/kernel/kfpu.c +++ b/arch/loongarch/kernel/kfpu.c @@ -8,19 +8,40 @@ #include #include +static unsigned int euen_mask = CSR_EUEN_FPEN; + +/* + * The critical section between kernel_fpu_begin() and kernel_fpu_end() + * is non-reentrant. It is the caller's responsibility to avoid reentrance. + * See drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c as an example. 
+ */
 static DEFINE_PER_CPU(bool, in_kernel_fpu);
+static DEFINE_PER_CPU(unsigned int, euen_current);

 void kernel_fpu_begin(void)
 {
+	unsigned int *euen_curr;
+
 	preempt_disable();

 	WARN_ON(this_cpu_read(in_kernel_fpu));

 	this_cpu_write(in_kernel_fpu, true);
+	euen_curr = this_cpu_ptr(&euen_current);

-	if (!is_fpu_owner())
-		enable_fpu();
+	*euen_curr = csr_xchg32(euen_mask, euen_mask, LOONGARCH_CSR_EUEN);
+
+#ifdef CONFIG_CPU_HAS_LASX
+	if (*euen_curr & CSR_EUEN_LASXEN)
+		_save_lasx(&current->thread.fpu);
 	else
+#endif
+#ifdef CONFIG_CPU_HAS_LSX
+	if (*euen_curr & CSR_EUEN_LSXEN)
+		_save_lsx(&current->thread.fpu);
+	else
+#endif
+	if (*euen_curr & CSR_EUEN_FPEN)
 		_save_fp(&current->thread.fpu);

 	write_fcsr(LOONGARCH_FCSR0, 0);
@@ -29,15 +50,41 @@ EXPORT_SYMBOL_GPL(kernel_fpu_begin);

 void kernel_fpu_end(void)
 {
+	unsigned int *euen_curr;
+
 	WARN_ON(!this_cpu_read(in_kernel_fpu));

-	if (!is_fpu_owner())
-		disable_fpu();
+	euen_curr = this_cpu_ptr(&euen_current);
+
+#ifdef CONFIG_CPU_HAS_LASX
+	if (*euen_curr & CSR_EUEN_LASXEN)
+		_restore_lasx(&current->thread.fpu);
 	else
+#endif
+#ifdef CONFIG_CPU_HAS_LSX
+	if (*euen_curr & CSR_EUEN_LSXEN)
+		_restore_lsx(&current->thread.fpu);
+	else
+#endif
+	if (*euen_curr & CSR_EUEN_FPEN)
 		_restore_fp(&current->thread.fpu);

+	*euen_curr = csr_xchg32(*euen_curr, euen_mask, LOONGARCH_CSR_EUEN);
+
 	this_cpu_write(in_kernel_fpu, false);

 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kernel_fpu_end);
+
+static int __init init_euen_mask(void)
+{
+	if (cpu_has_lsx)
+		euen_mask |= CSR_EUEN_LSXEN;
+
+	if (cpu_has_lasx)
+		euen_mask |= CSR_EUEN_LASXEN;
+
+	return 0;
+}
+arch_initcall(init_euen_mask);

From 75ded18a5e8e51ca2d26d55f010d60ae9aab652c Mon Sep 17 00:00:00 2001
From: WANG Xuerui
Date: Wed, 6 Sep 2023 22:53:55 +0800
Subject: [PATCH 10/25] LoongArch: Add SIMD-optimized XOR routines

Add LSX and LASX implementations of xor operations, operating on 64
bytes (one L1 cache line) at a time, for a balance between memory
utilization and instruction mix. Huacai confirmed that all future
LoongArch implementations by Loongson (that we care about) will likely
also feature 64-byte cache lines, and experiments show no throughput
improvement with further unrolling.
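These routines are not normally called directly: raid5 and friends go
through the generic xor_blocks() machinery, which benchmarks all
registered templates at boot and picks the fastest one. A direct call
would look like this minimal sketch (p1/p2 are assumed buffers; lengths
must be multiples of 64 bytes, and the glue handles the kernel FPU
critical section internally):

	#include <asm/xor_simd.h>

	/* p1 ^= p2 over one page; kernel_fpu_begin()/end() happen inside */
	xor_lsx_2(PAGE_SIZE, p1, p2);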
Performance numbers measured during system boot on a 3A5000 @ 2.5GHz: > 8regs : 12702 MB/sec > 8regs_prefetch : 10920 MB/sec > 32regs : 12686 MB/sec > 32regs_prefetch : 10918 MB/sec > lsx : 17589 MB/sec > lasx : 26116 MB/sec Acked-by: Song Liu Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/xor.h | 68 ++++++++++++++++ arch/loongarch/include/asm/xor_simd.h | 34 ++++++++ arch/loongarch/lib/Makefile | 2 + arch/loongarch/lib/xor_simd.c | 93 ++++++++++++++++++++++ arch/loongarch/lib/xor_simd.h | 38 +++++++++ arch/loongarch/lib/xor_simd_glue.c | 72 +++++++++++++++++ arch/loongarch/lib/xor_template.c | 110 ++++++++++++++++++++++++++ 7 files changed, 417 insertions(+) create mode 100644 arch/loongarch/include/asm/xor.h create mode 100644 arch/loongarch/include/asm/xor_simd.h create mode 100644 arch/loongarch/lib/xor_simd.c create mode 100644 arch/loongarch/lib/xor_simd.h create mode 100644 arch/loongarch/lib/xor_simd_glue.c create mode 100644 arch/loongarch/lib/xor_template.c diff --git a/arch/loongarch/include/asm/xor.h b/arch/loongarch/include/asm/xor.h new file mode 100644 index 000000000000..12467fffee46 --- /dev/null +++ b/arch/loongarch/include/asm/xor.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2023 WANG Xuerui + */ +#ifndef _ASM_LOONGARCH_XOR_H +#define _ASM_LOONGARCH_XOR_H + +#include +#include + +#ifdef CONFIG_CPU_HAS_LSX +static struct xor_block_template xor_block_lsx = { + .name = "lsx", + .do_2 = xor_lsx_2, + .do_3 = xor_lsx_3, + .do_4 = xor_lsx_4, + .do_5 = xor_lsx_5, +}; + +#define XOR_SPEED_LSX() \ + do { \ + if (cpu_has_lsx) \ + xor_speed(&xor_block_lsx); \ + } while (0) +#else /* CONFIG_CPU_HAS_LSX */ +#define XOR_SPEED_LSX() +#endif /* CONFIG_CPU_HAS_LSX */ + +#ifdef CONFIG_CPU_HAS_LASX +static struct xor_block_template xor_block_lasx = { + .name = "lasx", + .do_2 = xor_lasx_2, + .do_3 = xor_lasx_3, + .do_4 = xor_lasx_4, + .do_5 = xor_lasx_5, +}; + +#define XOR_SPEED_LASX() \ + do { \ + if (cpu_has_lasx) \ + xor_speed(&xor_block_lasx); \ + } while (0) +#else /* CONFIG_CPU_HAS_LASX */ +#define XOR_SPEED_LASX() +#endif /* CONFIG_CPU_HAS_LASX */ + +/* + * For grins, also test the generic routines. + * + * More importantly: it cannot be ruled out at this point of time, that some + * future (maybe reduced) models could run the vector algorithms slower than + * the scalar ones, maybe for errata or micro-op reasons. It may be + * appropriate to revisit this after one or two more uarch generations. 
+ */ +#include + +#undef XOR_TRY_TEMPLATES +#define XOR_TRY_TEMPLATES \ +do { \ + xor_speed(&xor_block_8regs); \ + xor_speed(&xor_block_8regs_p); \ + xor_speed(&xor_block_32regs); \ + xor_speed(&xor_block_32regs_p); \ + XOR_SPEED_LSX(); \ + XOR_SPEED_LASX(); \ +} while (0) + +#endif /* _ASM_LOONGARCH_XOR_H */ diff --git a/arch/loongarch/include/asm/xor_simd.h b/arch/loongarch/include/asm/xor_simd.h new file mode 100644 index 000000000000..471b96332f38 --- /dev/null +++ b/arch/loongarch/include/asm/xor_simd.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2023 WANG Xuerui + */ +#ifndef _ASM_LOONGARCH_XOR_SIMD_H +#define _ASM_LOONGARCH_XOR_SIMD_H + +#ifdef CONFIG_CPU_HAS_LSX +void xor_lsx_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2); +void xor_lsx_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, const unsigned long * __restrict p3); +void xor_lsx_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, const unsigned long * __restrict p3, + const unsigned long * __restrict p4); +void xor_lsx_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, const unsigned long * __restrict p3, + const unsigned long * __restrict p4, const unsigned long * __restrict p5); +#endif /* CONFIG_CPU_HAS_LSX */ + +#ifdef CONFIG_CPU_HAS_LASX +void xor_lasx_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2); +void xor_lasx_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, const unsigned long * __restrict p3); +void xor_lasx_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, const unsigned long * __restrict p3, + const unsigned long * __restrict p4); +void xor_lasx_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, const unsigned long * __restrict p3, + const unsigned long * __restrict p4, const unsigned long * __restrict p5); +#endif /* CONFIG_CPU_HAS_LASX */ + +#endif /* _ASM_LOONGARCH_XOR_SIMD_H */ diff --git a/arch/loongarch/lib/Makefile b/arch/loongarch/lib/Makefile index d60d4e096cfa..a77bf160bfc4 100644 --- a/arch/loongarch/lib/Makefile +++ b/arch/loongarch/lib/Makefile @@ -6,4 +6,6 @@ lib-y += delay.o memset.o memcpy.o memmove.o \ clear_user.o copy_user.o csum.o dump_tlb.o unaligned.o +obj-$(CONFIG_CPU_HAS_LSX) += xor_simd.o xor_simd_glue.o + obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/loongarch/lib/xor_simd.c b/arch/loongarch/lib/xor_simd.c new file mode 100644 index 000000000000..84cd24b728c4 --- /dev/null +++ b/arch/loongarch/lib/xor_simd.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * LoongArch SIMD XOR operations + * + * Copyright (C) 2023 WANG Xuerui + */ + +#include "xor_simd.h" + +/* + * Process one cache line (64 bytes) per loop. This is assuming all future + * popular LoongArch cores are similar performance-characteristics-wise to the + * current models. 
+ */ +#define LINE_WIDTH 64 + +#ifdef CONFIG_CPU_HAS_LSX + +#define LD(reg, base, offset) \ + "vld $vr" #reg ", %[" #base "], " #offset "\n\t" +#define ST(reg, base, offset) \ + "vst $vr" #reg ", %[" #base "], " #offset "\n\t" +#define XOR(dj, k) "vxor.v $vr" #dj ", $vr" #dj ", $vr" #k "\n\t" + +#define LD_INOUT_LINE(base) \ + LD(0, base, 0) \ + LD(1, base, 16) \ + LD(2, base, 32) \ + LD(3, base, 48) + +#define LD_AND_XOR_LINE(base) \ + LD(4, base, 0) \ + LD(5, base, 16) \ + LD(6, base, 32) \ + LD(7, base, 48) \ + XOR(0, 4) \ + XOR(1, 5) \ + XOR(2, 6) \ + XOR(3, 7) + +#define ST_LINE(base) \ + ST(0, base, 0) \ + ST(1, base, 16) \ + ST(2, base, 32) \ + ST(3, base, 48) + +#define XOR_FUNC_NAME(nr) __xor_lsx_##nr +#include "xor_template.c" + +#undef LD +#undef ST +#undef XOR +#undef LD_INOUT_LINE +#undef LD_AND_XOR_LINE +#undef ST_LINE +#undef XOR_FUNC_NAME + +#endif /* CONFIG_CPU_HAS_LSX */ + +#ifdef CONFIG_CPU_HAS_LASX + +#define LD(reg, base, offset) \ + "xvld $xr" #reg ", %[" #base "], " #offset "\n\t" +#define ST(reg, base, offset) \ + "xvst $xr" #reg ", %[" #base "], " #offset "\n\t" +#define XOR(dj, k) "xvxor.v $xr" #dj ", $xr" #dj ", $xr" #k "\n\t" + +#define LD_INOUT_LINE(base) \ + LD(0, base, 0) \ + LD(1, base, 32) + +#define LD_AND_XOR_LINE(base) \ + LD(2, base, 0) \ + LD(3, base, 32) \ + XOR(0, 2) \ + XOR(1, 3) + +#define ST_LINE(base) \ + ST(0, base, 0) \ + ST(1, base, 32) + +#define XOR_FUNC_NAME(nr) __xor_lasx_##nr +#include "xor_template.c" + +#undef LD +#undef ST +#undef XOR +#undef LD_INOUT_LINE +#undef LD_AND_XOR_LINE +#undef ST_LINE +#undef XOR_FUNC_NAME + +#endif /* CONFIG_CPU_HAS_LASX */ diff --git a/arch/loongarch/lib/xor_simd.h b/arch/loongarch/lib/xor_simd.h new file mode 100644 index 000000000000..f50f32514d80 --- /dev/null +++ b/arch/loongarch/lib/xor_simd.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Simple interface to link xor_simd.c and xor_simd_glue.c + * + * Separating these files ensures that no SIMD instructions are run outside of + * the kfpu critical section. 
+ */ + +#ifndef __LOONGARCH_LIB_XOR_SIMD_H +#define __LOONGARCH_LIB_XOR_SIMD_H + +#ifdef CONFIG_CPU_HAS_LSX +void __xor_lsx_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2); +void __xor_lsx_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, const unsigned long * __restrict p3); +void __xor_lsx_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, const unsigned long * __restrict p3, + const unsigned long * __restrict p4); +void __xor_lsx_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, const unsigned long * __restrict p3, + const unsigned long * __restrict p4, const unsigned long * __restrict p5); +#endif /* CONFIG_CPU_HAS_LSX */ + +#ifdef CONFIG_CPU_HAS_LASX +void __xor_lasx_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2); +void __xor_lasx_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, const unsigned long * __restrict p3); +void __xor_lasx_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, const unsigned long * __restrict p3, + const unsigned long * __restrict p4); +void __xor_lasx_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, const unsigned long * __restrict p3, + const unsigned long * __restrict p4, const unsigned long * __restrict p5); +#endif /* CONFIG_CPU_HAS_LASX */ + +#endif /* __LOONGARCH_LIB_XOR_SIMD_H */ diff --git a/arch/loongarch/lib/xor_simd_glue.c b/arch/loongarch/lib/xor_simd_glue.c new file mode 100644 index 000000000000..393f689dbcf6 --- /dev/null +++ b/arch/loongarch/lib/xor_simd_glue.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * LoongArch SIMD XOR operations + * + * Copyright (C) 2023 WANG Xuerui + */ + +#include +#include +#include +#include +#include "xor_simd.h" + +#define MAKE_XOR_GLUE_2(flavor) \ +void xor_##flavor##_2(unsigned long bytes, unsigned long * __restrict p1, \ + const unsigned long * __restrict p2) \ +{ \ + kernel_fpu_begin(); \ + __xor_##flavor##_2(bytes, p1, p2); \ + kernel_fpu_end(); \ +} \ +EXPORT_SYMBOL_GPL(xor_##flavor##_2) + +#define MAKE_XOR_GLUE_3(flavor) \ +void xor_##flavor##_3(unsigned long bytes, unsigned long * __restrict p1, \ + const unsigned long * __restrict p2, \ + const unsigned long * __restrict p3) \ +{ \ + kernel_fpu_begin(); \ + __xor_##flavor##_3(bytes, p1, p2, p3); \ + kernel_fpu_end(); \ +} \ +EXPORT_SYMBOL_GPL(xor_##flavor##_3) + +#define MAKE_XOR_GLUE_4(flavor) \ +void xor_##flavor##_4(unsigned long bytes, unsigned long * __restrict p1, \ + const unsigned long * __restrict p2, \ + const unsigned long * __restrict p3, \ + const unsigned long * __restrict p4) \ +{ \ + kernel_fpu_begin(); \ + __xor_##flavor##_4(bytes, p1, p2, p3, p4); \ + kernel_fpu_end(); \ +} \ +EXPORT_SYMBOL_GPL(xor_##flavor##_4) + +#define MAKE_XOR_GLUE_5(flavor) \ +void xor_##flavor##_5(unsigned long bytes, unsigned long * __restrict p1, \ + const unsigned long * __restrict p2, \ + const unsigned long * __restrict p3, \ + const unsigned long * __restrict p4, \ + const unsigned long * __restrict p5) \ +{ \ + kernel_fpu_begin(); \ + __xor_##flavor##_5(bytes, p1, p2, p3, p4, p5); \ + kernel_fpu_end(); \ +} \ +EXPORT_SYMBOL_GPL(xor_##flavor##_5) + +#define MAKE_XOR_GLUES(flavor) \ + MAKE_XOR_GLUE_2(flavor); \ + MAKE_XOR_GLUE_3(flavor); \ + MAKE_XOR_GLUE_4(flavor); \ + MAKE_XOR_GLUE_5(flavor) 
+ +#ifdef CONFIG_CPU_HAS_LSX +MAKE_XOR_GLUES(lsx); +#endif + +#ifdef CONFIG_CPU_HAS_LASX +MAKE_XOR_GLUES(lasx); +#endif diff --git a/arch/loongarch/lib/xor_template.c b/arch/loongarch/lib/xor_template.c new file mode 100644 index 000000000000..0358ced7fe33 --- /dev/null +++ b/arch/loongarch/lib/xor_template.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 WANG Xuerui + * + * Template for XOR operations, instantiated in xor_simd.c. + * + * Expected preprocessor definitions: + * + * - LINE_WIDTH + * - XOR_FUNC_NAME(nr) + * - LD_INOUT_LINE(buf) + * - LD_AND_XOR_LINE(buf) + * - ST_LINE(buf) + */ + +void XOR_FUNC_NAME(2)(unsigned long bytes, + unsigned long * __restrict v1, + const unsigned long * __restrict v2) +{ + unsigned long lines = bytes / LINE_WIDTH; + + do { + __asm__ __volatile__ ( + LD_INOUT_LINE(v1) + LD_AND_XOR_LINE(v2) + ST_LINE(v1) + : : [v1] "r"(v1), [v2] "r"(v2) : "memory" + ); + + v1 += LINE_WIDTH / sizeof(unsigned long); + v2 += LINE_WIDTH / sizeof(unsigned long); + } while (--lines > 0); +} + +void XOR_FUNC_NAME(3)(unsigned long bytes, + unsigned long * __restrict v1, + const unsigned long * __restrict v2, + const unsigned long * __restrict v3) +{ + unsigned long lines = bytes / LINE_WIDTH; + + do { + __asm__ __volatile__ ( + LD_INOUT_LINE(v1) + LD_AND_XOR_LINE(v2) + LD_AND_XOR_LINE(v3) + ST_LINE(v1) + : : [v1] "r"(v1), [v2] "r"(v2), [v3] "r"(v3) : "memory" + ); + + v1 += LINE_WIDTH / sizeof(unsigned long); + v2 += LINE_WIDTH / sizeof(unsigned long); + v3 += LINE_WIDTH / sizeof(unsigned long); + } while (--lines > 0); +} + +void XOR_FUNC_NAME(4)(unsigned long bytes, + unsigned long * __restrict v1, + const unsigned long * __restrict v2, + const unsigned long * __restrict v3, + const unsigned long * __restrict v4) +{ + unsigned long lines = bytes / LINE_WIDTH; + + do { + __asm__ __volatile__ ( + LD_INOUT_LINE(v1) + LD_AND_XOR_LINE(v2) + LD_AND_XOR_LINE(v3) + LD_AND_XOR_LINE(v4) + ST_LINE(v1) + : : [v1] "r"(v1), [v2] "r"(v2), [v3] "r"(v3), [v4] "r"(v4) + : "memory" + ); + + v1 += LINE_WIDTH / sizeof(unsigned long); + v2 += LINE_WIDTH / sizeof(unsigned long); + v3 += LINE_WIDTH / sizeof(unsigned long); + v4 += LINE_WIDTH / sizeof(unsigned long); + } while (--lines > 0); +} + +void XOR_FUNC_NAME(5)(unsigned long bytes, + unsigned long * __restrict v1, + const unsigned long * __restrict v2, + const unsigned long * __restrict v3, + const unsigned long * __restrict v4, + const unsigned long * __restrict v5) +{ + unsigned long lines = bytes / LINE_WIDTH; + + do { + __asm__ __volatile__ ( + LD_INOUT_LINE(v1) + LD_AND_XOR_LINE(v2) + LD_AND_XOR_LINE(v3) + LD_AND_XOR_LINE(v4) + LD_AND_XOR_LINE(v5) + ST_LINE(v1) + : : [v1] "r"(v1), [v2] "r"(v2), [v3] "r"(v3), [v4] "r"(v4), + [v5] "r"(v5) : "memory" + ); + + v1 += LINE_WIDTH / sizeof(unsigned long); + v2 += LINE_WIDTH / sizeof(unsigned long); + v3 += LINE_WIDTH / sizeof(unsigned long); + v4 += LINE_WIDTH / sizeof(unsigned long); + v5 += LINE_WIDTH / sizeof(unsigned long); + } while (--lines > 0); +} From 8f3f06dfd6873135068ccf1a0b386308e8c4da38 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Wed, 6 Sep 2023 22:53:55 +0800 Subject: [PATCH 11/25] raid6: Add LoongArch SIMD syndrome calculation The algorithms work on 64 bytes at a time, which is the L1 cache line size of all current and future LoongArch cores (that we care about), as confirmed by Huacai. The code is based on the generic int.uc algorithm, unrolled 4 times for LSX and 2 times for LASX. 
Further unrolling does not meaningfully improve the performance according to experiments. Performance numbers measured during system boot on a 3A5000 @ 2.5GHz: > raid6: lasx gen() 12726 MB/s > raid6: lsx gen() 10001 MB/s > raid6: int64x8 gen() 2876 MB/s > raid6: int64x4 gen() 3867 MB/s > raid6: int64x2 gen() 2531 MB/s > raid6: int64x1 gen() 1945 MB/s Comparison of xor() speeds (from different boots but meaningful anyway): > lasx: 11226 MB/s > lsx: 6395 MB/s > int64x4: 2147 MB/s Performance as measured by raid6test: > raid6: lasx gen() 25109 MB/s > raid6: lsx gen() 13233 MB/s > raid6: int64x8 gen() 4164 MB/s > raid6: int64x4 gen() 6005 MB/s > raid6: int64x2 gen() 5781 MB/s > raid6: int64x1 gen() 4119 MB/s > raid6: using algorithm lasx gen() 25109 MB/s > raid6: .... xor() 14439 MB/s, rmw enabled Acked-by: Song Liu Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- include/linux/raid/pq.h | 2 + lib/raid6/Makefile | 1 + lib/raid6/algos.c | 8 + lib/raid6/loongarch.h | 38 ++++ lib/raid6/loongarch_simd.c | 422 +++++++++++++++++++++++++++++++++++++ lib/raid6/test/Makefile | 12 ++ 6 files changed, 483 insertions(+) create mode 100644 lib/raid6/loongarch.h create mode 100644 lib/raid6/loongarch_simd.c diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index f29aaaf2eb21..874447485848 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -108,6 +108,8 @@ extern const struct raid6_calls raid6_vpermxor1; extern const struct raid6_calls raid6_vpermxor2; extern const struct raid6_calls raid6_vpermxor4; extern const struct raid6_calls raid6_vpermxor8; +extern const struct raid6_calls raid6_lsx; +extern const struct raid6_calls raid6_lasx; struct raid6_recov_calls { void (*data2)(int, size_t, int, int, void **); diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 45e17619422b..2b9ebe105480 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -9,6 +9,7 @@ raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \ vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o +raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o hostprogs += mktables diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index a22a05c9af8a..739c7ebcae1a 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -73,6 +73,14 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_neonx2, &raid6_neonx1, #endif +#ifdef CONFIG_LOONGARCH +#ifdef CONFIG_CPU_HAS_LASX + &raid6_lasx, +#endif +#ifdef CONFIG_CPU_HAS_LSX + &raid6_lsx, +#endif +#endif #if defined(__ia64__) &raid6_intx32, &raid6_intx16, diff --git a/lib/raid6/loongarch.h b/lib/raid6/loongarch.h new file mode 100644 index 000000000000..acfc33ce7056 --- /dev/null +++ b/lib/raid6/loongarch.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2023 WANG Xuerui + * + * raid6/loongarch.h + * + * Definitions common to LoongArch RAID-6 code only + */ + +#ifndef _LIB_RAID6_LOONGARCH_H +#define _LIB_RAID6_LOONGARCH_H + +#ifdef __KERNEL__ + +#include +#include + +#else /* for user-space testing */ + +#include + +/* have to supply these defines for glibc 2.37- and musl */ +#ifndef HWCAP_LOONGARCH_LSX +#define HWCAP_LOONGARCH_LSX (1 << 4) +#endif +#ifndef HWCAP_LOONGARCH_LASX +#define HWCAP_LOONGARCH_LASX (1 << 5) +#endif + +#define kernel_fpu_begin() +#define kernel_fpu_end() + +#define cpu_has_lsx (getauxval(AT_HWCAP) & HWCAP_LOONGARCH_LSX) 
+#define cpu_has_lasx (getauxval(AT_HWCAP) & HWCAP_LOONGARCH_LASX) + +#endif /* __KERNEL__ */ + +#endif /* _LIB_RAID6_LOONGARCH_H */ diff --git a/lib/raid6/loongarch_simd.c b/lib/raid6/loongarch_simd.c new file mode 100644 index 000000000000..aa5d9f924ca3 --- /dev/null +++ b/lib/raid6/loongarch_simd.c @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RAID6 syndrome calculations in LoongArch SIMD (LSX & LASX) + * + * Copyright 2023 WANG Xuerui + * + * Based on the generic RAID-6 code (int.uc): + * + * Copyright 2002-2004 H. Peter Anvin + */ + +#include +#include "loongarch.h" + +/* + * The vector algorithms are currently priority 0, which means the generic + * scalar algorithms are not being disabled if vector support is present. + * This is like the similar LoongArch RAID5 XOR code, with the main reason + * repeated here: it cannot be ruled out at this point of time, that some + * future (maybe reduced) models could run the vector algorithms slower than + * the scalar ones, maybe for errata or micro-op reasons. It may be + * appropriate to revisit this after one or two more uarch generations. + */ + +#ifdef CONFIG_CPU_HAS_LSX +#define NSIZE 16 + +static int raid6_has_lsx(void) +{ + return cpu_has_lsx; +} + +static void raid6_lsx_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $vr0, $vr1, $vr2, $vr3: wp + * $vr4, $vr5, $vr6, $vr7: wq + * $vr8, $vr9, $vr10, $vr11: wd + * $vr12, $vr13, $vr14, $vr15: w2 + * $vr16, $vr17, $vr18, $vr19: w1 + */ + for (d = 0; d < bytes; d += NSIZE*4) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE])); + asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE])); + asm volatile("vori.b $vr4, $vr0, 0"); + asm volatile("vori.b $vr5, $vr1, 0"); + asm volatile("vori.b $vr6, $vr2, 0"); + asm volatile("vori.b $vr7, $vr3, 0"); + for (z = z0-1; z >= 0; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE])); + asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE])); + asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("vxor.v $vr0, $vr0, $vr8"); + asm volatile("vxor.v $vr1, $vr1, $vr9"); + asm volatile("vxor.v $vr2, $vr2, $vr10"); + asm volatile("vxor.v $vr3, $vr3, $vr11"); + /* w2$$ = MASK(wq$$); */ + asm volatile("vslti.b $vr12, $vr4, 0"); + asm volatile("vslti.b $vr13, $vr5, 0"); + asm volatile("vslti.b $vr14, $vr6, 0"); + asm volatile("vslti.b $vr15, $vr7, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("vslli.b $vr16, $vr4, 1"); + asm volatile("vslli.b $vr17, $vr5, 1"); + asm volatile("vslli.b $vr18, $vr6, 1"); + asm volatile("vslli.b $vr19, $vr7, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("vandi.b $vr12, $vr12, 0x1d"); + asm volatile("vandi.b $vr13, $vr13, 0x1d"); + asm volatile("vandi.b $vr14, $vr14, 0x1d"); + asm volatile("vandi.b $vr15, $vr15, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("vxor.v $vr16, $vr16, $vr12"); + asm volatile("vxor.v $vr17, $vr17, $vr13"); + asm volatile("vxor.v $vr18, $vr18, $vr14"); + asm volatile("vxor.v $vr19, $vr19, $vr15"); + /* wq$$ = w1$$ ^ 
wd$$; */ + asm volatile("vxor.v $vr4, $vr16, $vr8"); + asm volatile("vxor.v $vr5, $vr17, $vr9"); + asm volatile("vxor.v $vr6, $vr18, $vr10"); + asm volatile("vxor.v $vr7, $vr19, $vr11"); + } + /* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */ + asm volatile("vst $vr0, %0" : "=m"(p[d+NSIZE*0])); + asm volatile("vst $vr1, %0" : "=m"(p[d+NSIZE*1])); + asm volatile("vst $vr2, %0" : "=m"(p[d+NSIZE*2])); + asm volatile("vst $vr3, %0" : "=m"(p[d+NSIZE*3])); + /* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */ + asm volatile("vst $vr4, %0" : "=m"(q[d+NSIZE*0])); + asm volatile("vst $vr5, %0" : "=m"(q[d+NSIZE*1])); + asm volatile("vst $vr6, %0" : "=m"(q[d+NSIZE*2])); + asm volatile("vst $vr7, %0" : "=m"(q[d+NSIZE*3])); + } + + kernel_fpu_end(); +} + +static void raid6_lsx_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $vr0, $vr1, $vr2, $vr3: wp + * $vr4, $vr5, $vr6, $vr7: wq + * $vr8, $vr9, $vr10, $vr11: wd + * $vr12, $vr13, $vr14, $vr15: w2 + * $vr16, $vr17, $vr18, $vr19: w1 + */ + for (d = 0; d < bytes; d += NSIZE*4) { + /* P/Q data pages */ + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE])); + asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE])); + asm volatile("vori.b $vr4, $vr0, 0"); + asm volatile("vori.b $vr5, $vr1, 0"); + asm volatile("vori.b $vr6, $vr2, 0"); + asm volatile("vori.b $vr7, $vr3, 0"); + for (z = z0-1; z >= start; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE])); + asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE])); + asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("vxor.v $vr0, $vr0, $vr8"); + asm volatile("vxor.v $vr1, $vr1, $vr9"); + asm volatile("vxor.v $vr2, $vr2, $vr10"); + asm volatile("vxor.v $vr3, $vr3, $vr11"); + /* w2$$ = MASK(wq$$); */ + asm volatile("vslti.b $vr12, $vr4, 0"); + asm volatile("vslti.b $vr13, $vr5, 0"); + asm volatile("vslti.b $vr14, $vr6, 0"); + asm volatile("vslti.b $vr15, $vr7, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("vslli.b $vr16, $vr4, 1"); + asm volatile("vslli.b $vr17, $vr5, 1"); + asm volatile("vslli.b $vr18, $vr6, 1"); + asm volatile("vslli.b $vr19, $vr7, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("vandi.b $vr12, $vr12, 0x1d"); + asm volatile("vandi.b $vr13, $vr13, 0x1d"); + asm volatile("vandi.b $vr14, $vr14, 0x1d"); + asm volatile("vandi.b $vr15, $vr15, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("vxor.v $vr16, $vr16, $vr12"); + asm volatile("vxor.v $vr17, $vr17, $vr13"); + asm volatile("vxor.v $vr18, $vr18, $vr14"); + asm volatile("vxor.v $vr19, $vr19, $vr15"); + /* wq$$ = w1$$ ^ wd$$; */ + asm volatile("vxor.v $vr4, $vr16, $vr8"); + asm volatile("vxor.v $vr5, $vr17, $vr9"); + asm volatile("vxor.v $vr6, $vr18, $vr10"); + asm volatile("vxor.v $vr7, $vr19, $vr11"); + } + + /* P/Q left side optimization */ + for (z = start-1; z >= 0; z--) { + /* w2$$ = MASK(wq$$); */ + asm volatile("vslti.b $vr12, $vr4, 0"); + asm volatile("vslti.b $vr13, $vr5, 0"); + asm volatile("vslti.b $vr14, $vr6, 0"); + asm volatile("vslti.b $vr15, 
$vr7, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("vslli.b $vr16, $vr4, 1"); + asm volatile("vslli.b $vr17, $vr5, 1"); + asm volatile("vslli.b $vr18, $vr6, 1"); + asm volatile("vslli.b $vr19, $vr7, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("vandi.b $vr12, $vr12, 0x1d"); + asm volatile("vandi.b $vr13, $vr13, 0x1d"); + asm volatile("vandi.b $vr14, $vr14, 0x1d"); + asm volatile("vandi.b $vr15, $vr15, 0x1d"); + /* wq$$ = w1$$ ^ w2$$; */ + asm volatile("vxor.v $vr4, $vr16, $vr12"); + asm volatile("vxor.v $vr5, $vr17, $vr13"); + asm volatile("vxor.v $vr6, $vr18, $vr14"); + asm volatile("vxor.v $vr7, $vr19, $vr15"); + } + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + */ + asm volatile( + "vld $vr20, %0\n\t" + "vld $vr21, %1\n\t" + "vld $vr22, %2\n\t" + "vld $vr23, %3\n\t" + "vld $vr24, %4\n\t" + "vld $vr25, %5\n\t" + "vld $vr26, %6\n\t" + "vld $vr27, %7\n\t" + "vxor.v $vr20, $vr20, $vr0\n\t" + "vxor.v $vr21, $vr21, $vr1\n\t" + "vxor.v $vr22, $vr22, $vr2\n\t" + "vxor.v $vr23, $vr23, $vr3\n\t" + "vxor.v $vr24, $vr24, $vr4\n\t" + "vxor.v $vr25, $vr25, $vr5\n\t" + "vxor.v $vr26, $vr26, $vr6\n\t" + "vxor.v $vr27, $vr27, $vr7\n\t" + "vst $vr20, %0\n\t" + "vst $vr21, %1\n\t" + "vst $vr22, %2\n\t" + "vst $vr23, %3\n\t" + "vst $vr24, %4\n\t" + "vst $vr25, %5\n\t" + "vst $vr26, %6\n\t" + "vst $vr27, %7\n\t" + : "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]), + "+m"(p[d+NSIZE*2]), "+m"(p[d+NSIZE*3]), + "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]), + "+m"(q[d+NSIZE*2]), "+m"(q[d+NSIZE*3]) + ); + } + + kernel_fpu_end(); +} + +const struct raid6_calls raid6_lsx = { + raid6_lsx_gen_syndrome, + raid6_lsx_xor_syndrome, + raid6_has_lsx, + "lsx", + .priority = 0 /* see the comment near the top of the file for reason */ +}; + +#undef NSIZE +#endif /* CONFIG_CPU_HAS_LSX */ + +#ifdef CONFIG_CPU_HAS_LASX +#define NSIZE 32 + +static int raid6_has_lasx(void) +{ + return cpu_has_lasx; +} + +static void raid6_lasx_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $xr0, $xr1: wp + * $xr2, $xr3: wq + * $xr4, $xr5: wd + * $xr6, $xr7: w2 + * $xr8, $xr9: w1 + */ + for (d = 0; d < bytes; d += NSIZE*2) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("xvori.b $xr2, $xr0, 0"); + asm volatile("xvori.b $xr3, $xr1, 0"); + for (z = z0-1; z >= 0; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + /* w2$$ = MASK(wq$$); */ + asm volatile("xvslti.b $xr6, $xr2, 0"); + asm volatile("xvslti.b $xr7, $xr3, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("xvslli.b $xr8, $xr2, 1"); + asm volatile("xvslli.b $xr9, $xr3, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("xvandi.b $xr6, $xr6, 0x1d"); + asm volatile("xvandi.b $xr7, $xr7, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("xvxor.v $xr8, $xr8, $xr6"); + asm volatile("xvxor.v $xr9, $xr9, $xr7"); + /* wq$$ = w1$$ ^ wd$$; */ + asm volatile("xvxor.v $xr2, $xr8, $xr4"); + asm volatile("xvxor.v $xr3, $xr9, $xr5"); + } + /* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */ 
+ asm volatile("xvst $xr0, %0" : "=m"(p[d+NSIZE*0])); + asm volatile("xvst $xr1, %0" : "=m"(p[d+NSIZE*1])); + /* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */ + asm volatile("xvst $xr2, %0" : "=m"(q[d+NSIZE*0])); + asm volatile("xvst $xr3, %0" : "=m"(q[d+NSIZE*1])); + } + + kernel_fpu_end(); +} + +static void raid6_lasx_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $xr0, $xr1: wp + * $xr2, $xr3: wq + * $xr4, $xr5: wd + * $xr6, $xr7: w2 + * $xr8, $xr9: w1 + */ + for (d = 0; d < bytes; d += NSIZE*2) { + /* P/Q data pages */ + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("xvori.b $xr2, $xr0, 0"); + asm volatile("xvori.b $xr3, $xr1, 0"); + for (z = z0-1; z >= start; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + /* w2$$ = MASK(wq$$); */ + asm volatile("xvslti.b $xr6, $xr2, 0"); + asm volatile("xvslti.b $xr7, $xr3, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("xvslli.b $xr8, $xr2, 1"); + asm volatile("xvslli.b $xr9, $xr3, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("xvandi.b $xr6, $xr6, 0x1d"); + asm volatile("xvandi.b $xr7, $xr7, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("xvxor.v $xr8, $xr8, $xr6"); + asm volatile("xvxor.v $xr9, $xr9, $xr7"); + /* wq$$ = w1$$ ^ wd$$; */ + asm volatile("xvxor.v $xr2, $xr8, $xr4"); + asm volatile("xvxor.v $xr3, $xr9, $xr5"); + } + + /* P/Q left side optimization */ + for (z = start-1; z >= 0; z--) { + /* w2$$ = MASK(wq$$); */ + asm volatile("xvslti.b $xr6, $xr2, 0"); + asm volatile("xvslti.b $xr7, $xr3, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("xvslli.b $xr8, $xr2, 1"); + asm volatile("xvslli.b $xr9, $xr3, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("xvandi.b $xr6, $xr6, 0x1d"); + asm volatile("xvandi.b $xr7, $xr7, 0x1d"); + /* wq$$ = w1$$ ^ w2$$; */ + asm volatile("xvxor.v $xr2, $xr8, $xr6"); + asm volatile("xvxor.v $xr3, $xr9, $xr7"); + } + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + */ + asm volatile( + "xvld $xr10, %0\n\t" + "xvld $xr11, %1\n\t" + "xvld $xr12, %2\n\t" + "xvld $xr13, %3\n\t" + "xvxor.v $xr10, $xr10, $xr0\n\t" + "xvxor.v $xr11, $xr11, $xr1\n\t" + "xvxor.v $xr12, $xr12, $xr2\n\t" + "xvxor.v $xr13, $xr13, $xr3\n\t" + "xvst $xr10, %0\n\t" + "xvst $xr11, %1\n\t" + "xvst $xr12, %2\n\t" + "xvst $xr13, %3\n\t" + : "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]), + "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]) + ); + } + + kernel_fpu_end(); +} + +const struct raid6_calls raid6_lasx = { + raid6_lasx_gen_syndrome, + raid6_lasx_xor_syndrome, + raid6_has_lasx, + "lasx", + .priority = 0 /* see the comment near the top of the file for reason */ +}; +#undef NSIZE +#endif /* CONFIG_CPU_HAS_LASX */ diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 1f693ea3b980..7b244bce32b3 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -41,6 +41,16 @@ ifeq ($(findstring ppc,$(ARCH)),ppc) gcc -c -x c - >/dev/null && rm ./-.o && echo yes) endif +ifeq 
($(ARCH),loongarch64) + CFLAGS += -I../../../arch/loongarch/include -DCONFIG_LOONGARCH=1 + CFLAGS += $(shell echo 'vld $$vr0, $$zero, 0' | \ + gcc -c -x assembler - >/dev/null 2>&1 && \ + rm ./-.o && echo -DCONFIG_CPU_HAS_LSX=1) + CFLAGS += $(shell echo 'xvld $$xr0, $$zero, 0' | \ + gcc -c -x assembler - >/dev/null 2>&1 && \ + rm ./-.o && echo -DCONFIG_CPU_HAS_LASX=1) +endif + ifeq ($(IS_X86),yes) OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o CFLAGS += -DCONFIG_X86 @@ -54,6 +64,8 @@ else ifeq ($(HAS_ALTIVEC),yes) CFLAGS += -DCONFIG_ALTIVEC OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \ vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o +else ifeq ($(ARCH),loongarch64) + OBJS += loongarch_simd.o endif .c.o: From f2091321044d9fbcadb93dfc1c9cf23e563ea40c Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Wed, 6 Sep 2023 22:53:55 +0800 Subject: [PATCH 12/25] raid6: Add LoongArch SIMD recovery implementation Similar to the syndrome calculation, the recovery algorithms also work on 64 bytes at a time to align with the L1 cache line size of current and future LoongArch cores (that we care about). Which means unrolled-by-4 LSX and unrolled-by-2 LASX code. The assembly is originally based on the x86 SSSE3/AVX2 ports, but register allocation has been redone to take advantage of LSX/LASX's 32 vector registers, and instruction sequence has been optimized to suit (e.g. LoongArch can perform per-byte srl and andi on vectors, but x86 cannot). Performance numbers measured by instrumenting the raid6test code, on a 3A5000 system clocked at 2.5GHz: > lasx 2data: 354.987 MiB/s > lasx datap: 350.430 MiB/s > lsx 2data: 340.026 MiB/s > lsx datap: 337.318 MiB/s > intx1 2data: 164.280 MiB/s > intx1 datap: 187.966 MiB/s Because recovery algorithms are chosen solely based on priority and availability, lasx is marked as priority 2 and lsx priority 1. At least for the current generation of LoongArch micro-architectures, LASX should always be faster than LSX whenever supported, and have similar power consumption characteristics (because the only known LASX-capable uarch, the LA464, always compute the full 256-bit result for vector ops). 
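For reference, the per-byte arithmetic that both unrolled loops implement is the same as in the generic recovery code. A scalar sketch follows (the standalone function and its name are illustrative only; pbmul/qmul are the 32-byte nibble-indexed GF(2^8) tables from raid6_vgfmul, split into a low-nibble half and a high-nibble half exactly as the two shuffle lookups consume them):

  #include <stddef.h>

  typedef unsigned char u8;

  /*
   * Recover the two lost data blocks: Dx is written to dq, Dy to dp.
   * p/q hold the stored P/Q; dp/dq hold the P/Q recomputed with the
   * failed blocks zeroed, so the first XOR yields P+Pxy and Q+Qxy.
   */
  static void recov_2data_ref(size_t bytes, const u8 *p, const u8 *q,
                              u8 *dp, u8 *dq,
                              const u8 *pbmul, const u8 *qmul)
  {
          while (bytes--) {
                  u8 px = *p++ ^ *dp;                               /* P + Pxy */
                  u8 qx = *q++ ^ *dq;                               /* Q + Qxy */
                  u8 bq = qmul[qx & 0x0f] ^ qmul[16 + (qx >> 4)];   /* B(Q + Qxy) */
                  u8 ap = pbmul[px & 0x0f] ^ pbmul[16 + (px >> 4)]; /* A(P + Pxy) */
                  u8 dx = ap ^ bq;                                  /* Dx */

                  *dq++ = dx;
                  *dp++ = px ^ dx;                                  /* Dy = P + Pxy + Dx */
          }
  }

The LSX code vectorizes this loop with four 16-byte registers per quantity (64 bytes per iteration), the LASX code with two 32-byte registers.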
Acked-by: Song Liu Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- include/linux/raid/pq.h | 2 + lib/raid6/Makefile | 2 +- lib/raid6/algos.c | 8 + lib/raid6/recov_loongarch_simd.c | 513 +++++++++++++++++++++++++++++++ lib/raid6/test/Makefile | 2 +- 5 files changed, 525 insertions(+), 2 deletions(-) create mode 100644 lib/raid6/recov_loongarch_simd.c diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 874447485848..006e18decfad 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -125,6 +125,8 @@ extern const struct raid6_recov_calls raid6_recov_avx2; extern const struct raid6_recov_calls raid6_recov_avx512; extern const struct raid6_recov_calls raid6_recov_s390xc; extern const struct raid6_recov_calls raid6_recov_neon; +extern const struct raid6_recov_calls raid6_recov_lsx; +extern const struct raid6_recov_calls raid6_recov_lasx; extern const struct raid6_calls raid6_neonx1; extern const struct raid6_calls raid6_neonx2; diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 2b9ebe105480..035b0a4db476 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -9,7 +9,7 @@ raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \ vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o -raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o +raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o recov_loongarch_simd.o hostprogs += mktables diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 739c7ebcae1a..0ec534faf019 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -111,6 +111,14 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = { #endif #if defined(CONFIG_KERNEL_MODE_NEON) &raid6_recov_neon, +#endif +#ifdef CONFIG_LOONGARCH +#ifdef CONFIG_CPU_HAS_LASX + &raid6_recov_lasx, +#endif +#ifdef CONFIG_CPU_HAS_LSX + &raid6_recov_lsx, +#endif #endif &raid6_recov_intx1, NULL diff --git a/lib/raid6/recov_loongarch_simd.c b/lib/raid6/recov_loongarch_simd.c new file mode 100644 index 000000000000..94aeac85e6f7 --- /dev/null +++ b/lib/raid6/recov_loongarch_simd.c @@ -0,0 +1,513 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * RAID6 recovery algorithms in LoongArch SIMD (LSX & LASX) + * + * Copyright (C) 2023 WANG Xuerui + * + * Originally based on recov_avx2.c and recov_ssse3.c: + * + * Copyright (C) 2012 Intel Corporation + * Author: Jim Kukunas + */ + +#include +#include "loongarch.h" + +/* + * Unlike with the syndrome calculation algorithms, there's no boot-time + * selection of recovery algorithms by benchmarking, so we have to specify + * the priorities and hope the future cores will all have decent vector + * support (i.e. no LASX slower than LSX, or even scalar code). 
+ */ + +#ifdef CONFIG_CPU_HAS_LSX +static int raid6_has_lsx(void) +{ + return cpu_has_lsx; +} + +static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks - 2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks - 2] = p; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* + * vr20, vr21: qmul + * vr22, vr23: pbmul + */ + asm volatile("vld $vr20, %0" : : "m" (qmul[0])); + asm volatile("vld $vr21, %0" : : "m" (qmul[16])); + asm volatile("vld $vr22, %0" : : "m" (pbmul[0])); + asm volatile("vld $vr23, %0" : : "m" (pbmul[16])); + + while (bytes) { + /* vr4 - vr7: Q */ + asm volatile("vld $vr4, %0" : : "m" (q[0])); + asm volatile("vld $vr5, %0" : : "m" (q[16])); + asm volatile("vld $vr6, %0" : : "m" (q[32])); + asm volatile("vld $vr7, %0" : : "m" (q[48])); + /* vr4 - vr7: Q + Qxy */ + asm volatile("vld $vr8, %0" : : "m" (dq[0])); + asm volatile("vld $vr9, %0" : : "m" (dq[16])); + asm volatile("vld $vr10, %0" : : "m" (dq[32])); + asm volatile("vld $vr11, %0" : : "m" (dq[48])); + asm volatile("vxor.v $vr4, $vr4, $vr8"); + asm volatile("vxor.v $vr5, $vr5, $vr9"); + asm volatile("vxor.v $vr6, $vr6, $vr10"); + asm volatile("vxor.v $vr7, $vr7, $vr11"); + /* vr0 - vr3: P */ + asm volatile("vld $vr0, %0" : : "m" (p[0])); + asm volatile("vld $vr1, %0" : : "m" (p[16])); + asm volatile("vld $vr2, %0" : : "m" (p[32])); + asm volatile("vld $vr3, %0" : : "m" (p[48])); + /* vr0 - vr3: P + Pxy */ + asm volatile("vld $vr8, %0" : : "m" (dp[0])); + asm volatile("vld $vr9, %0" : : "m" (dp[16])); + asm volatile("vld $vr10, %0" : : "m" (dp[32])); + asm volatile("vld $vr11, %0" : : "m" (dp[48])); + asm volatile("vxor.v $vr0, $vr0, $vr8"); + asm volatile("vxor.v $vr1, $vr1, $vr9"); + asm volatile("vxor.v $vr2, $vr2, $vr10"); + asm volatile("vxor.v $vr3, $vr3, $vr11"); + + /* vr8 - vr11: higher 4 bits of each byte of (Q + Qxy) */ + asm volatile("vsrli.b $vr8, $vr4, 4"); + asm volatile("vsrli.b $vr9, $vr5, 4"); + asm volatile("vsrli.b $vr10, $vr6, 4"); + asm volatile("vsrli.b $vr11, $vr7, 4"); + /* vr4 - vr7: lower 4 bits of each byte of (Q + Qxy) */ + asm volatile("vandi.b $vr4, $vr4, 0x0f"); + asm volatile("vandi.b $vr5, $vr5, 0x0f"); + asm volatile("vandi.b $vr6, $vr6, 0x0f"); + asm volatile("vandi.b $vr7, $vr7, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("vshuf.b $vr4, $vr20, $vr20, $vr4"); + asm volatile("vshuf.b $vr5, $vr20, $vr20, $vr5"); + asm volatile("vshuf.b $vr6, $vr20, $vr20, $vr6"); + asm volatile("vshuf.b $vr7, $vr20, $vr20, $vr7"); + /* lookup from qmul[16] */ + asm volatile("vshuf.b $vr8, $vr21, $vr21, $vr8"); + asm volatile("vshuf.b $vr9, $vr21, $vr21, $vr9"); + asm volatile("vshuf.b $vr10, $vr21, $vr21, $vr10"); + asm volatile("vshuf.b $vr11, $vr21, $vr21, $vr11"); + /* vr16 - vr19: 
B(Q + Qxy) */ + asm volatile("vxor.v $vr16, $vr8, $vr4"); + asm volatile("vxor.v $vr17, $vr9, $vr5"); + asm volatile("vxor.v $vr18, $vr10, $vr6"); + asm volatile("vxor.v $vr19, $vr11, $vr7"); + + /* vr4 - vr7: higher 4 bits of each byte of (P + Pxy) */ + asm volatile("vsrli.b $vr4, $vr0, 4"); + asm volatile("vsrli.b $vr5, $vr1, 4"); + asm volatile("vsrli.b $vr6, $vr2, 4"); + asm volatile("vsrli.b $vr7, $vr3, 4"); + /* vr12 - vr15: lower 4 bits of each byte of (P + Pxy) */ + asm volatile("vandi.b $vr12, $vr0, 0x0f"); + asm volatile("vandi.b $vr13, $vr1, 0x0f"); + asm volatile("vandi.b $vr14, $vr2, 0x0f"); + asm volatile("vandi.b $vr15, $vr3, 0x0f"); + /* lookup from pbmul[0] */ + asm volatile("vshuf.b $vr12, $vr22, $vr22, $vr12"); + asm volatile("vshuf.b $vr13, $vr22, $vr22, $vr13"); + asm volatile("vshuf.b $vr14, $vr22, $vr22, $vr14"); + asm volatile("vshuf.b $vr15, $vr22, $vr22, $vr15"); + /* lookup from pbmul[16] */ + asm volatile("vshuf.b $vr4, $vr23, $vr23, $vr4"); + asm volatile("vshuf.b $vr5, $vr23, $vr23, $vr5"); + asm volatile("vshuf.b $vr6, $vr23, $vr23, $vr6"); + asm volatile("vshuf.b $vr7, $vr23, $vr23, $vr7"); + /* vr4 - vr7: A(P + Pxy) */ + asm volatile("vxor.v $vr4, $vr4, $vr12"); + asm volatile("vxor.v $vr5, $vr5, $vr13"); + asm volatile("vxor.v $vr6, $vr6, $vr14"); + asm volatile("vxor.v $vr7, $vr7, $vr15"); + + /* vr4 - vr7: A(P + Pxy) + B(Q + Qxy) = Dx */ + asm volatile("vxor.v $vr4, $vr4, $vr16"); + asm volatile("vxor.v $vr5, $vr5, $vr17"); + asm volatile("vxor.v $vr6, $vr6, $vr18"); + asm volatile("vxor.v $vr7, $vr7, $vr19"); + asm volatile("vst $vr4, %0" : "=m" (dq[0])); + asm volatile("vst $vr5, %0" : "=m" (dq[16])); + asm volatile("vst $vr6, %0" : "=m" (dq[32])); + asm volatile("vst $vr7, %0" : "=m" (dq[48])); + + /* vr0 - vr3: P + Pxy + Dx = Dy */ + asm volatile("vxor.v $vr0, $vr0, $vr4"); + asm volatile("vxor.v $vr1, $vr1, $vr5"); + asm volatile("vxor.v $vr2, $vr2, $vr6"); + asm volatile("vxor.v $vr3, $vr3, $vr7"); + asm volatile("vst $vr0, %0" : "=m" (dp[0])); + asm volatile("vst $vr1, %0" : "=m" (dp[16])); + asm volatile("vst $vr2, %0" : "=m" (dp[32])); + asm volatile("vst $vr3, %0" : "=m" (dp[48])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + /* vr22, vr23: qmul */ + asm volatile("vld $vr22, %0" : : "m" (qmul[0])); + asm volatile("vld $vr23, %0" : : "m" (qmul[16])); + + while (bytes) { + /* vr0 - vr3: P + Dx */ + asm volatile("vld $vr0, %0" : : "m" (p[0])); + asm volatile("vld $vr1, %0" : : "m" (p[16])); + asm volatile("vld $vr2, %0" : : "m" (p[32])); + asm volatile("vld $vr3, %0" : : "m" (p[48])); + /* vr4 - vr7: Qx */ + asm volatile("vld $vr4, %0" : : "m" (dq[0])); + asm volatile("vld $vr5, %0" : : "m" (dq[16])); + asm volatile("vld $vr6, %0" : : "m" (dq[32])); + asm volatile("vld $vr7, %0" : : "m" (dq[48])); + /* vr4 - vr7: Q + Qx */ + 
asm volatile("vld $vr8, %0" : : "m" (q[0])); + asm volatile("vld $vr9, %0" : : "m" (q[16])); + asm volatile("vld $vr10, %0" : : "m" (q[32])); + asm volatile("vld $vr11, %0" : : "m" (q[48])); + asm volatile("vxor.v $vr4, $vr4, $vr8"); + asm volatile("vxor.v $vr5, $vr5, $vr9"); + asm volatile("vxor.v $vr6, $vr6, $vr10"); + asm volatile("vxor.v $vr7, $vr7, $vr11"); + + /* vr8 - vr11: higher 4 bits of each byte of (Q + Qx) */ + asm volatile("vsrli.b $vr8, $vr4, 4"); + asm volatile("vsrli.b $vr9, $vr5, 4"); + asm volatile("vsrli.b $vr10, $vr6, 4"); + asm volatile("vsrli.b $vr11, $vr7, 4"); + /* vr4 - vr7: lower 4 bits of each byte of (Q + Qx) */ + asm volatile("vandi.b $vr4, $vr4, 0x0f"); + asm volatile("vandi.b $vr5, $vr5, 0x0f"); + asm volatile("vandi.b $vr6, $vr6, 0x0f"); + asm volatile("vandi.b $vr7, $vr7, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("vshuf.b $vr4, $vr22, $vr22, $vr4"); + asm volatile("vshuf.b $vr5, $vr22, $vr22, $vr5"); + asm volatile("vshuf.b $vr6, $vr22, $vr22, $vr6"); + asm volatile("vshuf.b $vr7, $vr22, $vr22, $vr7"); + /* lookup from qmul[16] */ + asm volatile("vshuf.b $vr8, $vr23, $vr23, $vr8"); + asm volatile("vshuf.b $vr9, $vr23, $vr23, $vr9"); + asm volatile("vshuf.b $vr10, $vr23, $vr23, $vr10"); + asm volatile("vshuf.b $vr11, $vr23, $vr23, $vr11"); + /* vr4 - vr7: qmul(Q + Qx) = Dx */ + asm volatile("vxor.v $vr4, $vr4, $vr8"); + asm volatile("vxor.v $vr5, $vr5, $vr9"); + asm volatile("vxor.v $vr6, $vr6, $vr10"); + asm volatile("vxor.v $vr7, $vr7, $vr11"); + asm volatile("vst $vr4, %0" : "=m" (dq[0])); + asm volatile("vst $vr5, %0" : "=m" (dq[16])); + asm volatile("vst $vr6, %0" : "=m" (dq[32])); + asm volatile("vst $vr7, %0" : "=m" (dq[48])); + + /* vr0 - vr3: P + Dx + Dx = P */ + asm volatile("vxor.v $vr0, $vr0, $vr4"); + asm volatile("vxor.v $vr1, $vr1, $vr5"); + asm volatile("vxor.v $vr2, $vr2, $vr6"); + asm volatile("vxor.v $vr3, $vr3, $vr7"); + asm volatile("vst $vr0, %0" : "=m" (p[0])); + asm volatile("vst $vr1, %0" : "=m" (p[16])); + asm volatile("vst $vr2, %0" : "=m" (p[32])); + asm volatile("vst $vr3, %0" : "=m" (p[48])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_lsx = { + .data2 = raid6_2data_recov_lsx, + .datap = raid6_datap_recov_lsx, + .valid = raid6_has_lsx, + .name = "lsx", + .priority = 1, +}; +#endif /* CONFIG_CPU_HAS_LSX */ + +#ifdef CONFIG_CPU_HAS_LASX +static int raid6_has_lasx(void) +{ + return cpu_has_lasx; +} + +static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks - 2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks - 2] = p; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* + * xr20, xr21: qmul + * xr22, xr23: pbmul + */ + asm 
volatile("vld $vr20, %0" : : "m" (qmul[0])); + asm volatile("vld $vr21, %0" : : "m" (qmul[16])); + asm volatile("vld $vr22, %0" : : "m" (pbmul[0])); + asm volatile("vld $vr23, %0" : : "m" (pbmul[16])); + asm volatile("xvreplve0.q $xr20, $xr20"); + asm volatile("xvreplve0.q $xr21, $xr21"); + asm volatile("xvreplve0.q $xr22, $xr22"); + asm volatile("xvreplve0.q $xr23, $xr23"); + + while (bytes) { + /* xr0, xr1: Q */ + asm volatile("xvld $xr0, %0" : : "m" (q[0])); + asm volatile("xvld $xr1, %0" : : "m" (q[32])); + /* xr0, xr1: Q + Qxy */ + asm volatile("xvld $xr4, %0" : : "m" (dq[0])); + asm volatile("xvld $xr5, %0" : : "m" (dq[32])); + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + /* xr2, xr3: P */ + asm volatile("xvld $xr2, %0" : : "m" (p[0])); + asm volatile("xvld $xr3, %0" : : "m" (p[32])); + /* xr2, xr3: P + Pxy */ + asm volatile("xvld $xr4, %0" : : "m" (dp[0])); + asm volatile("xvld $xr5, %0" : : "m" (dp[32])); + asm volatile("xvxor.v $xr2, $xr2, $xr4"); + asm volatile("xvxor.v $xr3, $xr3, $xr5"); + + /* xr4, xr5: higher 4 bits of each byte of (Q + Qxy) */ + asm volatile("xvsrli.b $xr4, $xr0, 4"); + asm volatile("xvsrli.b $xr5, $xr1, 4"); + /* xr0, xr1: lower 4 bits of each byte of (Q + Qxy) */ + asm volatile("xvandi.b $xr0, $xr0, 0x0f"); + asm volatile("xvandi.b $xr1, $xr1, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("xvshuf.b $xr0, $xr20, $xr20, $xr0"); + asm volatile("xvshuf.b $xr1, $xr20, $xr20, $xr1"); + /* lookup from qmul[16] */ + asm volatile("xvshuf.b $xr4, $xr21, $xr21, $xr4"); + asm volatile("xvshuf.b $xr5, $xr21, $xr21, $xr5"); + /* xr6, xr7: B(Q + Qxy) */ + asm volatile("xvxor.v $xr6, $xr4, $xr0"); + asm volatile("xvxor.v $xr7, $xr5, $xr1"); + + /* xr4, xr5: higher 4 bits of each byte of (P + Pxy) */ + asm volatile("xvsrli.b $xr4, $xr2, 4"); + asm volatile("xvsrli.b $xr5, $xr3, 4"); + /* xr0, xr1: lower 4 bits of each byte of (P + Pxy) */ + asm volatile("xvandi.b $xr0, $xr2, 0x0f"); + asm volatile("xvandi.b $xr1, $xr3, 0x0f"); + /* lookup from pbmul[0] */ + asm volatile("xvshuf.b $xr0, $xr22, $xr22, $xr0"); + asm volatile("xvshuf.b $xr1, $xr22, $xr22, $xr1"); + /* lookup from pbmul[16] */ + asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4"); + asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5"); + /* xr0, xr1: A(P + Pxy) */ + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + + /* xr0, xr1: A(P + Pxy) + B(Q + Qxy) = Dx */ + asm volatile("xvxor.v $xr0, $xr0, $xr6"); + asm volatile("xvxor.v $xr1, $xr1, $xr7"); + + /* xr2, xr3: P + Pxy + Dx = Dy */ + asm volatile("xvxor.v $xr2, $xr2, $xr0"); + asm volatile("xvxor.v $xr3, $xr3, $xr1"); + + asm volatile("xvst $xr0, %0" : "=m" (dq[0])); + asm volatile("xvst $xr1, %0" : "=m" (dq[32])); + asm volatile("xvst $xr2, %0" : "=m" (dp[0])); + asm volatile("xvst $xr3, %0" : "=m" (dp[32])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks - 1] = q; + + /* Now, pick the 
proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + /* xr22, xr23: qmul */ + asm volatile("vld $vr22, %0" : : "m" (qmul[0])); + asm volatile("xvreplve0.q $xr22, $xr22"); + asm volatile("vld $vr23, %0" : : "m" (qmul[16])); + asm volatile("xvreplve0.q $xr23, $xr23"); + + while (bytes) { + /* xr0, xr1: P + Dx */ + asm volatile("xvld $xr0, %0" : : "m" (p[0])); + asm volatile("xvld $xr1, %0" : : "m" (p[32])); + /* xr2, xr3: Qx */ + asm volatile("xvld $xr2, %0" : : "m" (dq[0])); + asm volatile("xvld $xr3, %0" : : "m" (dq[32])); + /* xr2, xr3: Q + Qx */ + asm volatile("xvld $xr4, %0" : : "m" (q[0])); + asm volatile("xvld $xr5, %0" : : "m" (q[32])); + asm volatile("xvxor.v $xr2, $xr2, $xr4"); + asm volatile("xvxor.v $xr3, $xr3, $xr5"); + + /* xr4, xr5: higher 4 bits of each byte of (Q + Qx) */ + asm volatile("xvsrli.b $xr4, $xr2, 4"); + asm volatile("xvsrli.b $xr5, $xr3, 4"); + /* xr2, xr3: lower 4 bits of each byte of (Q + Qx) */ + asm volatile("xvandi.b $xr2, $xr2, 0x0f"); + asm volatile("xvandi.b $xr3, $xr3, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("xvshuf.b $xr2, $xr22, $xr22, $xr2"); + asm volatile("xvshuf.b $xr3, $xr22, $xr22, $xr3"); + /* lookup from qmul[16] */ + asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4"); + asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5"); + /* xr2, xr3: qmul(Q + Qx) = Dx */ + asm volatile("xvxor.v $xr2, $xr2, $xr4"); + asm volatile("xvxor.v $xr3, $xr3, $xr5"); + + /* xr0, xr1: P + Dx + Dx = P */ + asm volatile("xvxor.v $xr0, $xr0, $xr2"); + asm volatile("xvxor.v $xr1, $xr1, $xr3"); + + asm volatile("xvst $xr2, %0" : "=m" (dq[0])); + asm volatile("xvst $xr3, %0" : "=m" (dq[32])); + asm volatile("xvst $xr0, %0" : "=m" (p[0])); + asm volatile("xvst $xr1, %0" : "=m" (p[32])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_lasx = { + .data2 = raid6_2data_recov_lasx, + .datap = raid6_datap_recov_lasx, + .valid = raid6_has_lasx, + .name = "lasx", + .priority = 2, +}; +#endif /* CONFIG_CPU_HAS_LASX */ diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 7b244bce32b3..2abe0076a636 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -65,7 +65,7 @@ else ifeq ($(HAS_ALTIVEC),yes) OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \ vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o else ifeq ($(ARCH),loongarch64) - OBJS += loongarch_simd.o + OBJS += loongarch_simd.o recov_loongarch_simd.o endif .c.o: From bd3c5798484aa9a08302a844d7a75a2ee3b53d05 Mon Sep 17 00:00:00 2001 From: Qi Hu Date: Wed, 6 Sep 2023 22:53:55 +0800 Subject: [PATCH 13/25] LoongArch: Add Loongson Binary Translation (LBT) extension support Loongson Binary Translation (LBT) is used to accelerate binary translation, which contains 4 scratch registers (scr0 to scr3), x86/ARM eflags (eflags) and x87 fpu stack pointer (ftop). This patch support kernel to save/restore these registers, handle the LBT exception and maintain sigcontext. 
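Once the corresponding bits reach elf_hwcap (see the cpu-probe.c hunk below), user space such as LAT can probe for the individual translation flavours through AT_HWCAP. A minimal, purely illustrative check, assuming the installed uapi <asm/hwcap.h> exports the HWCAP_LOONGARCH_LBT_* bits used below:

  #include <stdio.h>
  #include <sys/auxv.h>
  #include <asm/hwcap.h>

  int main(void)
  {
          unsigned long hwcap = getauxval(AT_HWCAP);

          /* Each flavour is reported independently by the kernel. */
          if (hwcap & HWCAP_LOONGARCH_LBT_X86)
                  puts("LBT x86 flavour available");
          if (hwcap & HWCAP_LOONGARCH_LBT_ARM)
                  puts("LBT ARM flavour available");
          if (hwcap & HWCAP_LOONGARCH_LBT_MIPS)
                  puts("LBT MIPS flavour available");

          return 0;
  }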
Signed-off-by: Qi Hu Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 15 ++ arch/loongarch/include/asm/asm-prototypes.h | 1 + arch/loongarch/include/asm/asmmacro.h | 51 ++++- arch/loongarch/include/asm/lbt.h | 109 +++++++++++ arch/loongarch/include/asm/loongarch.h | 4 + arch/loongarch/include/asm/processor.h | 26 +-- arch/loongarch/include/asm/switch_to.h | 2 + arch/loongarch/include/asm/thread_info.h | 4 + arch/loongarch/include/uapi/asm/ptrace.h | 6 + arch/loongarch/include/uapi/asm/sigcontext.h | 10 + arch/loongarch/kernel/Makefile | 2 + arch/loongarch/kernel/asm-offsets.c | 18 +- arch/loongarch/kernel/cpu-probe.c | 14 ++ arch/loongarch/kernel/fpu.S | 9 +- arch/loongarch/kernel/lbt.S | 155 +++++++++++++++ arch/loongarch/kernel/process.c | 15 +- arch/loongarch/kernel/ptrace.c | 54 ++++++ arch/loongarch/kernel/signal.c | 188 +++++++++++++++++++ arch/loongarch/kernel/traps.c | 41 +++- 19 files changed, 693 insertions(+), 31 deletions(-) create mode 100644 arch/loongarch/include/asm/lbt.h create mode 100644 arch/loongarch/kernel/lbt.S diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 465759f6b0ed..21cc2e2d1f27 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -254,6 +254,9 @@ config AS_HAS_LSX_EXTENSION config AS_HAS_LASX_EXTENSION def_bool $(as-instr,xvld \$xr0$(comma)\$a0$(comma)0) +config AS_HAS_LBT_EXTENSION + def_bool $(as-instr,movscr2gr \$a0$(comma)\$scr0) + menu "Kernel type and options" source "kernel/Kconfig.hz" @@ -534,6 +537,18 @@ config CPU_HAS_LASX If unsure, say Y. +config CPU_HAS_LBT + bool "Support for the Loongson Binary Translation Extension" + depends on AS_HAS_LBT_EXTENSION + help + Loongson Binary Translation (LBT) introduces 4 scratch registers (SCR0 + to SCR3), x86/ARM eflags (eflags) and x87 fpu stack pointer (ftop). + Enabling this option allows the kernel to allocate and switch registers + specific to LBT. + + If you want to use this feature, such as the Loongson Architecture + Translator (LAT), say Y. 
+ config CPU_HAS_PREFETCH bool default y diff --git a/arch/loongarch/include/asm/asm-prototypes.h b/arch/loongarch/include/asm/asm-prototypes.h index ed06d3997420..cf8e1a4e7c19 100644 --- a/arch/loongarch/include/asm/asm-prototypes.h +++ b/arch/loongarch/include/asm/asm-prototypes.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include +#include #include #include #include diff --git a/arch/loongarch/include/asm/asmmacro.h b/arch/loongarch/include/asm/asmmacro.h index af542a8d847f..c9544f358c33 100644 --- a/arch/loongarch/include/asm/asmmacro.h +++ b/arch/loongarch/include/asm/asmmacro.h @@ -41,12 +41,51 @@ .macro fpu_save_csr thread tmp movfcsr2gr \tmp, fcsr0 - stptr.w \tmp, \thread, THREAD_FCSR + stptr.w \tmp, \thread, THREAD_FCSR +#ifdef CONFIG_CPU_HAS_LBT + /* TM bit is always 0 if LBT not supported */ + andi \tmp, \tmp, FPU_CSR_TM + beqz \tmp, 1f + /* Save FTOP */ + x86mftop \tmp + stptr.w \tmp, \thread, THREAD_FTOP + /* Turn off TM to ensure the order of FPR in memory independent of TM */ + x86clrtm +1: +#endif .endm - .macro fpu_restore_csr thread tmp - ldptr.w \tmp, \thread, THREAD_FCSR - movgr2fcsr fcsr0, \tmp + .macro fpu_restore_csr thread tmp0 tmp1 + ldptr.w \tmp0, \thread, THREAD_FCSR + movgr2fcsr fcsr0, \tmp0 +#ifdef CONFIG_CPU_HAS_LBT + /* TM bit is always 0 if LBT not supported */ + andi \tmp0, \tmp0, FPU_CSR_TM + beqz \tmp0, 2f + /* Restore FTOP */ + ldptr.w \tmp0, \thread, THREAD_FTOP + andi \tmp0, \tmp0, 0x7 + la.pcrel \tmp1, 1f + alsl.d \tmp1, \tmp0, \tmp1, 3 + jr \tmp1 +1: + x86mttop 0 + b 2f + x86mttop 1 + b 2f + x86mttop 2 + b 2f + x86mttop 3 + b 2f + x86mttop 4 + b 2f + x86mttop 5 + b 2f + x86mttop 6 + b 2f + x86mttop 7 +2: +#endif .endm .macro fpu_save_cc thread tmp0 tmp1 @@ -246,7 +285,7 @@ .macro lsx_restore_all thread tmp0 tmp1 lsx_restore_data \thread, \tmp0 fpu_restore_cc \thread, \tmp0, \tmp1 - fpu_restore_csr \thread, \tmp0 + fpu_restore_csr \thread, \tmp0, \tmp1 .endm .macro lsx_save_upper vd base tmp off @@ -456,7 +495,7 @@ .macro lasx_restore_all thread tmp0 tmp1 lasx_restore_data \thread, \tmp0 fpu_restore_cc \thread, \tmp0, \tmp1 - fpu_restore_csr \thread, \tmp0 + fpu_restore_csr \thread, \tmp0, \tmp1 .endm .macro lasx_save_upper xd base tmp off diff --git a/arch/loongarch/include/asm/lbt.h b/arch/loongarch/include/asm/lbt.h new file mode 100644 index 000000000000..e671978bf552 --- /dev/null +++ b/arch/loongarch/include/asm/lbt.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Author: Qi Hu + * Huacai Chen + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ +#ifndef _ASM_LBT_H +#define _ASM_LBT_H + +#include +#include +#include +#include + +extern void _init_lbt(void); +extern void _save_lbt(struct loongarch_lbt *); +extern void _restore_lbt(struct loongarch_lbt *); + +static inline int is_lbt_enabled(void) +{ + if (!cpu_has_lbt) + return 0; + + return (csr_read32(LOONGARCH_CSR_EUEN) & CSR_EUEN_LBTEN) ? 
+ 1 : 0; +} + +static inline int is_lbt_owner(void) +{ + return test_thread_flag(TIF_USEDLBT); +} + +#ifdef CONFIG_CPU_HAS_LBT + +static inline void enable_lbt(void) +{ + if (cpu_has_lbt) + csr_xchg32(CSR_EUEN_LBTEN, CSR_EUEN_LBTEN, LOONGARCH_CSR_EUEN); +} + +static inline void disable_lbt(void) +{ + if (cpu_has_lbt) + csr_xchg32(0, CSR_EUEN_LBTEN, LOONGARCH_CSR_EUEN); +} + +static inline void __own_lbt(void) +{ + enable_lbt(); + set_thread_flag(TIF_USEDLBT); + KSTK_EUEN(current) |= CSR_EUEN_LBTEN; +} + +static inline void own_lbt_inatomic(int restore) +{ + if (cpu_has_lbt && !is_lbt_owner()) { + __own_lbt(); + if (restore) + _restore_lbt(¤t->thread.lbt); + } +} + +static inline void own_lbt(int restore) +{ + preempt_disable(); + own_lbt_inatomic(restore); + preempt_enable(); +} + +static inline void lose_lbt_inatomic(int save, struct task_struct *tsk) +{ + if (cpu_has_lbt && is_lbt_owner()) { + if (save) + _save_lbt(&tsk->thread.lbt); + + disable_lbt(); + clear_tsk_thread_flag(tsk, TIF_USEDLBT); + } + KSTK_EUEN(tsk) &= ~(CSR_EUEN_LBTEN); +} + +static inline void lose_lbt(int save) +{ + preempt_disable(); + lose_lbt_inatomic(save, current); + preempt_enable(); +} + +static inline void init_lbt(void) +{ + __own_lbt(); + _init_lbt(); +} +#else +static inline void own_lbt_inatomic(int restore) {} +static inline void lose_lbt_inatomic(int save, struct task_struct *tsk) {} +static inline void init_lbt(void) {} +static inline void lose_lbt(int save) {} +#endif + +static inline int thread_lbt_context_live(void) +{ + if (!cpu_has_lbt) + return 0; + + return test_thread_flag(TIF_LBT_CTX_LIVE); +} + +#endif /* _ASM_LBT_H */ diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index a500efe0fd92..33531d432b49 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -1410,6 +1410,10 @@ __BUILD_CSR_OP(tlbidx) #define FPU_CSR_RU 0x200 /* towards +Infinity */ #define FPU_CSR_RD 0x300 /* towards -Infinity */ +/* Bit 6 of FPU Status Register specify the LBT TOP simulation mode */ +#define FPU_CSR_TM_SHIFT 0x6 +#define FPU_CSR_TM (_ULCAST_(1) << FPU_CSR_TM_SHIFT) + #define read_fcsr(source) \ ({ \ unsigned int __res; \ diff --git a/arch/loongarch/include/asm/processor.h b/arch/loongarch/include/asm/processor.h index 636e1c66398c..c3bc44b5f5b3 100644 --- a/arch/loongarch/include/asm/processor.h +++ b/arch/loongarch/include/asm/processor.h @@ -80,11 +80,22 @@ BUILD_FPR_ACCESS(32) BUILD_FPR_ACCESS(64) struct loongarch_fpu { - unsigned int fcsr; uint64_t fcc; /* 8x8 */ + uint32_t fcsr; + uint32_t ftop; union fpureg fpr[NUM_FPU_REGS]; }; +struct loongarch_lbt { + /* Scratch registers */ + unsigned long scr0; + unsigned long scr1; + unsigned long scr2; + unsigned long scr3; + /* Eflags register */ + unsigned long eflags; +}; + #define INIT_CPUMASK { \ {0,} \ } @@ -113,15 +124,6 @@ struct thread_struct { unsigned long csr_ecfg; unsigned long csr_badvaddr; /* Last user fault */ - /* Scratch registers */ - unsigned long scr0; - unsigned long scr1; - unsigned long scr2; - unsigned long scr3; - - /* Eflags register */ - unsigned long eflags; - /* Other stuff associated with the thread. */ unsigned long trap_nr; unsigned long error_code; @@ -133,6 +135,7 @@ struct thread_struct { * context because they are conditionally copied at fork(). */ struct loongarch_fpu fpu FPU_ALIGN; + struct loongarch_lbt lbt; /* Also conditionally copied */ /* Hardware breakpoints pinned to this task. 
*/ struct perf_event *hbp_break[LOONGARCH_MAX_BRP]; @@ -174,8 +177,9 @@ struct thread_struct { * FPU & vector registers \ */ \ .fpu = { \ - .fcsr = 0, \ .fcc = 0, \ + .fcsr = 0, \ + .ftop = 0, \ .fpr = {{{0,},},}, \ }, \ .hbp_break = {0}, \ diff --git a/arch/loongarch/include/asm/switch_to.h b/arch/loongarch/include/asm/switch_to.h index 24e3094bebab..5b225aff3ba2 100644 --- a/arch/loongarch/include/asm/switch_to.h +++ b/arch/loongarch/include/asm/switch_to.h @@ -7,6 +7,7 @@ #include #include +#include struct task_struct; @@ -34,6 +35,7 @@ extern asmlinkage struct task_struct *__switch_to(struct task_struct *prev, #define switch_to(prev, next, last) \ do { \ lose_fpu_inatomic(1, prev); \ + lose_lbt_inatomic(1, prev); \ hw_breakpoint_thread_switch(next); \ (last) = __switch_to(prev, next, task_thread_info(next), \ __builtin_return_address(0), __builtin_frame_address(0)); \ diff --git a/arch/loongarch/include/asm/thread_info.h b/arch/loongarch/include/asm/thread_info.h index 1a3354ca056e..8cb653d49a54 100644 --- a/arch/loongarch/include/asm/thread_info.h +++ b/arch/loongarch/include/asm/thread_info.h @@ -84,6 +84,8 @@ register unsigned long current_stack_pointer __asm__("$sp"); #define TIF_SINGLESTEP 16 /* Single Step */ #define TIF_LSX_CTX_LIVE 17 /* LSX context must be preserved */ #define TIF_LASX_CTX_LIVE 18 /* LASX context must be preserved */ +#define TIF_USEDLBT 19 /* LBT was used by this task this quantum (SMP) */ +#define TIF_LBT_CTX_LIVE 20 /* LBT context must be preserved */ #define _TIF_SIGPENDING (1<options |= LOONGARCH_CPU_LVZ; elf_hwcap |= HWCAP_LOONGARCH_LVZ; } +#ifdef CONFIG_CPU_HAS_LBT + if (config & CPUCFG2_X86BT) { + c->options |= LOONGARCH_CPU_LBT_X86; + elf_hwcap |= HWCAP_LOONGARCH_LBT_X86; + } + if (config & CPUCFG2_ARMBT) { + c->options |= LOONGARCH_CPU_LBT_ARM; + elf_hwcap |= HWCAP_LOONGARCH_LBT_ARM; + } + if (config & CPUCFG2_MIPSBT) { + c->options |= LOONGARCH_CPU_LBT_MIPS; + elf_hwcap |= HWCAP_LOONGARCH_LBT_MIPS; + } +#endif config = read_cpucfg(LOONGARCH_CPUCFG6); if (config & CPUCFG6_PMP) diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S index 80dccf8e39a7..d53ab10f4644 100644 --- a/arch/loongarch/kernel/fpu.S +++ b/arch/loongarch/kernel/fpu.S @@ -138,6 +138,13 @@ .macro sc_save_fcsr base, tmp0 movfcsr2gr \tmp0, fcsr0 EX st.w \tmp0, \base, 0 +#if defined(CONFIG_CPU_HAS_LBT) + /* TM bit is always 0 if LBT not supported */ + andi \tmp0, \tmp0, FPU_CSR_TM + beqz \tmp0, 1f + x86clrtm +1: +#endif .endm .macro sc_restore_fcsr base, tmp0 @@ -309,7 +316,7 @@ EXPORT_SYMBOL(_save_fp) */ SYM_FUNC_START(_restore_fp) fpu_restore_double a0 t1 # clobbers t1 - fpu_restore_csr a0 t1 + fpu_restore_csr a0 t1 t2 fpu_restore_cc a0 t1 t2 # clobbers t1, t2 jr ra SYM_FUNC_END(_restore_fp) diff --git a/arch/loongarch/kernel/lbt.S b/arch/loongarch/kernel/lbt.S new file mode 100644 index 000000000000..9c75120a26d8 --- /dev/null +++ b/arch/loongarch/kernel/lbt.S @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Author: Qi Hu + * Huacai Chen + * + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ +#include +#include +#include +#include +#include +#include + +#define SCR_REG_WIDTH 8 + + .macro EX insn, reg, src, offs +.ex\@: \insn \reg, \src, \offs + _asm_extable .ex\@, .L_lbt_fault + .endm + +/* + * Save a thread's lbt context. 
+ */ +SYM_FUNC_START(_save_lbt) + movscr2gr t1, $scr0 # save scr + stptr.d t1, a0, THREAD_SCR0 + movscr2gr t1, $scr1 + stptr.d t1, a0, THREAD_SCR1 + movscr2gr t1, $scr2 + stptr.d t1, a0, THREAD_SCR2 + movscr2gr t1, $scr3 + stptr.d t1, a0, THREAD_SCR3 + + x86mfflag t1, 0x3f # save eflags + stptr.d t1, a0, THREAD_EFLAGS + jr ra +SYM_FUNC_END(_save_lbt) +EXPORT_SYMBOL(_save_lbt) + +/* + * Restore a thread's lbt context. + */ +SYM_FUNC_START(_restore_lbt) + ldptr.d t1, a0, THREAD_SCR0 # restore scr + movgr2scr $scr0, t1 + ldptr.d t1, a0, THREAD_SCR1 + movgr2scr $scr1, t1 + ldptr.d t1, a0, THREAD_SCR2 + movgr2scr $scr2, t1 + ldptr.d t1, a0, THREAD_SCR3 + movgr2scr $scr3, t1 + + ldptr.d t1, a0, THREAD_EFLAGS # restore eflags + x86mtflag t1, 0x3f + jr ra +SYM_FUNC_END(_restore_lbt) +EXPORT_SYMBOL(_restore_lbt) + +/* + * Load scr/eflag with zero. + */ +SYM_FUNC_START(_init_lbt) + movgr2scr $scr0, zero + movgr2scr $scr1, zero + movgr2scr $scr2, zero + movgr2scr $scr3, zero + + x86mtflag zero, 0x3f + jr ra +SYM_FUNC_END(_init_lbt) + +/* + * a0: scr + * a1: eflag + */ +SYM_FUNC_START(_save_lbt_context) + movscr2gr t1, $scr0 # save scr + EX st.d t1, a0, (0 * SCR_REG_WIDTH) + movscr2gr t1, $scr1 + EX st.d t1, a0, (1 * SCR_REG_WIDTH) + movscr2gr t1, $scr2 + EX st.d t1, a0, (2 * SCR_REG_WIDTH) + movscr2gr t1, $scr3 + EX st.d t1, a0, (3 * SCR_REG_WIDTH) + + x86mfflag t1, 0x3f # save eflags + EX st.w t1, a1, 0 + li.w a0, 0 # success + jr ra +SYM_FUNC_END(_save_lbt_context) + +/* + * a0: scr + * a1: eflag + */ +SYM_FUNC_START(_restore_lbt_context) + EX ld.d t1, a0, (0 * SCR_REG_WIDTH) # restore scr + movgr2scr $scr0, t1 + EX ld.d t1, a0, (1 * SCR_REG_WIDTH) + movgr2scr $scr1, t1 + EX ld.d t1, a0, (2 * SCR_REG_WIDTH) + movgr2scr $scr2, t1 + EX ld.d t1, a0, (3 * SCR_REG_WIDTH) + movgr2scr $scr3, t1 + + EX ld.w t1, a1, 0 # restore eflags + x86mtflag t1, 0x3f + li.w a0, 0 # success + jr ra +SYM_FUNC_END(_restore_lbt_context) + +/* + * a0: ftop + */ +SYM_FUNC_START(_save_ftop_context) + x86mftop t1 + st.w t1, a0, 0 + li.w a0, 0 # success + jr ra +SYM_FUNC_END(_save_ftop_context) + +/* + * a0: ftop + */ +SYM_FUNC_START(_restore_ftop_context) + ld.w t1, a0, 0 + andi t1, t1, 0x7 + la.pcrel a0, 1f + alsl.d a0, t1, a0, 3 + jr a0 +1: + x86mttop 0 + b 2f + x86mttop 1 + b 2f + x86mttop 2 + b 2f + x86mttop 3 + b 2f + x86mttop 4 + b 2f + x86mttop 5 + b 2f + x86mttop 6 + b 2f + x86mttop 7 +2: + li.w a0, 0 # success + jr ra +SYM_FUNC_END(_restore_ftop_context) + +.L_lbt_fault: + li.w a0, -EFAULT # failure + jr ra diff --git a/arch/loongarch/kernel/process.c b/arch/loongarch/kernel/process.c index 4ee1e9d6a65f..91ccedd9db6a 100644 --- a/arch/loongarch/kernel/process.c +++ b/arch/loongarch/kernel/process.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -82,9 +83,11 @@ void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) euen = regs->csr_euen & ~(CSR_EUEN_FPEN); regs->csr_euen = euen; lose_fpu(0); + lose_lbt(0); clear_thread_flag(TIF_LSX_CTX_LIVE); clear_thread_flag(TIF_LASX_CTX_LIVE); + clear_thread_flag(TIF_LBT_CTX_LIVE); clear_used_math(); regs->csr_era = pc; regs->regs[3] = sp; @@ -121,10 +124,14 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) preempt_enable(); - if (used_math()) - memcpy(dst, src, sizeof(struct task_struct)); - else + if (!used_math()) memcpy(dst, src, offsetof(struct task_struct, thread.fpu.fpr)); + else + memcpy(dst, src, offsetof(struct task_struct, thread.lbt.scr0)); + +#ifdef CONFIG_CPU_HAS_LBT + 
memcpy(&dst->thread.lbt, &src->thread.lbt, sizeof(struct loongarch_lbt)); +#endif return 0; } @@ -189,8 +196,10 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) ptrace_hw_copy_thread(p); clear_tsk_thread_flag(p, TIF_USEDFPU); clear_tsk_thread_flag(p, TIF_USEDSIMD); + clear_tsk_thread_flag(p, TIF_USEDLBT); clear_tsk_thread_flag(p, TIF_LSX_CTX_LIVE); clear_tsk_thread_flag(p, TIF_LASX_CTX_LIVE); + clear_tsk_thread_flag(p, TIF_LBT_CTX_LIVE); return 0; } diff --git a/arch/loongarch/kernel/ptrace.c b/arch/loongarch/kernel/ptrace.c index f72adbf530c6..c114c5ef1332 100644 --- a/arch/loongarch/kernel/ptrace.c +++ b/arch/loongarch/kernel/ptrace.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -338,6 +339,46 @@ static int simd_set(struct task_struct *target, #endif /* CONFIG_CPU_HAS_LSX */ +#ifdef CONFIG_CPU_HAS_LBT +static int lbt_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + int r; + + r = membuf_write(&to, &target->thread.lbt.scr0, sizeof(target->thread.lbt.scr0)); + r = membuf_write(&to, &target->thread.lbt.scr1, sizeof(target->thread.lbt.scr1)); + r = membuf_write(&to, &target->thread.lbt.scr2, sizeof(target->thread.lbt.scr2)); + r = membuf_write(&to, &target->thread.lbt.scr3, sizeof(target->thread.lbt.scr3)); + r = membuf_write(&to, &target->thread.lbt.eflags, sizeof(u32)); + r = membuf_write(&to, &target->thread.fpu.ftop, sizeof(u32)); + + return r; +} + +static int lbt_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int err = 0; + const int eflags_start = 4 * sizeof(target->thread.lbt.scr0); + const int ftop_start = eflags_start + sizeof(u32); + + err |= user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.lbt.scr0, + 0, 4 * sizeof(target->thread.lbt.scr0)); + err |= user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.lbt.eflags, + eflags_start, ftop_start); + err |= user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.fpu.ftop, + ftop_start, ftop_start + sizeof(u32)); + + return err; +} +#endif /* CONFIG_CPU_HAS_LBT */ + #ifdef CONFIG_HAVE_HW_BREAKPOINT /* @@ -802,6 +843,9 @@ enum loongarch_regset { #ifdef CONFIG_CPU_HAS_LASX REGSET_LASX, #endif +#ifdef CONFIG_CPU_HAS_LBT + REGSET_LBT, +#endif #ifdef CONFIG_HAVE_HW_BREAKPOINT REGSET_HW_BREAK, REGSET_HW_WATCH, @@ -853,6 +897,16 @@ static const struct user_regset loongarch64_regsets[] = { .set = simd_set, }, #endif +#ifdef CONFIG_CPU_HAS_LBT + [REGSET_LBT] = { + .core_note_type = NT_LOONGARCH_LBT, + .n = 5, + .size = sizeof(u64), + .align = sizeof(u64), + .regset_get = lbt_get, + .set = lbt_set, + }, +#endif #ifdef CONFIG_HAVE_HW_BREAKPOINT [REGSET_HW_BREAK] = { .core_note_type = NT_LOONGARCH_HW_BREAK, diff --git a/arch/loongarch/kernel/signal.c b/arch/loongarch/kernel/signal.c index ceb899366c0a..504fdfe85203 100644 --- a/arch/loongarch/kernel/signal.c +++ b/arch/loongarch/kernel/signal.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -44,6 +45,9 @@ /* Make sure we will not lose FPU ownership */ #define lock_fpu_owner() ({ preempt_disable(); pagefault_disable(); }) #define unlock_fpu_owner() ({ pagefault_enable(); preempt_enable(); }) +/* Make sure we will not lose LBT ownership */ +#define lock_lbt_owner() ({ preempt_disable(); pagefault_disable(); }) +#define unlock_lbt_owner() ({ pagefault_enable(); preempt_enable(); }) /* Assembly functions to move 
context to/from the FPU */ extern asmlinkage int @@ -59,6 +63,13 @@ _save_lasx_context(void __user *fpregs, void __user *fcc, void __user *fcsr); extern asmlinkage int _restore_lasx_context(void __user *fpregs, void __user *fcc, void __user *fcsr); +#ifdef CONFIG_CPU_HAS_LBT +extern asmlinkage int _save_lbt_context(void __user *regs, void __user *eflags); +extern asmlinkage int _restore_lbt_context(void __user *regs, void __user *eflags); +extern asmlinkage int _save_ftop_context(void __user *ftop); +extern asmlinkage int _restore_ftop_context(void __user *ftop); +#endif + struct rt_sigframe { struct siginfo rs_info; struct ucontext rs_uctx; @@ -75,6 +86,7 @@ struct extctx_layout { struct _ctx_layout fpu; struct _ctx_layout lsx; struct _ctx_layout lasx; + struct _ctx_layout lbt; struct _ctx_layout end; }; @@ -215,6 +227,52 @@ static int copy_lasx_from_sigcontext(struct lasx_context __user *ctx) return err; } +#ifdef CONFIG_CPU_HAS_LBT +static int copy_lbt_to_sigcontext(struct lbt_context __user *ctx) +{ + int err = 0; + uint64_t __user *regs = (uint64_t *)&ctx->regs; + uint32_t __user *eflags = (uint32_t *)&ctx->eflags; + + err |= __put_user(current->thread.lbt.scr0, ®s[0]); + err |= __put_user(current->thread.lbt.scr1, ®s[1]); + err |= __put_user(current->thread.lbt.scr2, ®s[2]); + err |= __put_user(current->thread.lbt.scr3, ®s[3]); + err |= __put_user(current->thread.lbt.eflags, eflags); + + return err; +} + +static int copy_lbt_from_sigcontext(struct lbt_context __user *ctx) +{ + int err = 0; + uint64_t __user *regs = (uint64_t *)&ctx->regs; + uint32_t __user *eflags = (uint32_t *)&ctx->eflags; + + err |= __get_user(current->thread.lbt.scr0, ®s[0]); + err |= __get_user(current->thread.lbt.scr1, ®s[1]); + err |= __get_user(current->thread.lbt.scr2, ®s[2]); + err |= __get_user(current->thread.lbt.scr3, ®s[3]); + err |= __get_user(current->thread.lbt.eflags, eflags); + + return err; +} + +static int copy_ftop_to_sigcontext(struct lbt_context __user *ctx) +{ + uint32_t __user *ftop = &ctx->ftop; + + return __put_user(current->thread.fpu.ftop, ftop); +} + +static int copy_ftop_from_sigcontext(struct lbt_context __user *ctx) +{ + uint32_t __user *ftop = &ctx->ftop; + + return __get_user(current->thread.fpu.ftop, ftop); +} +#endif + /* * Wrappers for the assembly _{save,restore}_fp_context functions. */ @@ -272,6 +330,41 @@ static int restore_hw_lasx_context(struct lasx_context __user *ctx) return _restore_lasx_context(regs, fcc, fcsr); } +/* + * Wrappers for the assembly _{save,restore}_lbt_context functions. 
+ */ +#ifdef CONFIG_CPU_HAS_LBT +static int save_hw_lbt_context(struct lbt_context __user *ctx) +{ + uint64_t __user *regs = (uint64_t *)&ctx->regs; + uint32_t __user *eflags = (uint32_t *)&ctx->eflags; + + return _save_lbt_context(regs, eflags); +} + +static int restore_hw_lbt_context(struct lbt_context __user *ctx) +{ + uint64_t __user *regs = (uint64_t *)&ctx->regs; + uint32_t __user *eflags = (uint32_t *)&ctx->eflags; + + return _restore_lbt_context(regs, eflags); +} + +static int save_hw_ftop_context(struct lbt_context __user *ctx) +{ + uint32_t __user *ftop = &ctx->ftop; + + return _save_ftop_context(ftop); +} + +static int restore_hw_ftop_context(struct lbt_context __user *ctx) +{ + uint32_t __user *ftop = &ctx->ftop; + + return _restore_ftop_context(ftop); +} +#endif + static int fcsr_pending(unsigned int __user *fcsr) { int err, sig = 0; @@ -519,6 +612,77 @@ static int protected_restore_lasx_context(struct extctx_layout *extctx) return err ?: sig; } +#ifdef CONFIG_CPU_HAS_LBT +static int protected_save_lbt_context(struct extctx_layout *extctx) +{ + int err = 0; + struct sctx_info __user *info = extctx->lbt.addr; + struct lbt_context __user *lbt_ctx = + (struct lbt_context *)get_ctx_through_ctxinfo(info); + uint64_t __user *regs = (uint64_t *)&lbt_ctx->regs; + uint32_t __user *eflags = (uint32_t *)&lbt_ctx->eflags; + + while (1) { + lock_lbt_owner(); + if (is_lbt_owner()) + err |= save_hw_lbt_context(lbt_ctx); + else + err |= copy_lbt_to_sigcontext(lbt_ctx); + if (is_fpu_owner()) + err |= save_hw_ftop_context(lbt_ctx); + else + err |= copy_ftop_to_sigcontext(lbt_ctx); + unlock_lbt_owner(); + + err |= __put_user(LBT_CTX_MAGIC, &info->magic); + err |= __put_user(extctx->lbt.size, &info->size); + + if (likely(!err)) + break; + /* Touch the LBT context and try again */ + err = __put_user(0, ®s[0]) | __put_user(0, eflags); + + if (err) + return err; + } + + return err; +} + +static int protected_restore_lbt_context(struct extctx_layout *extctx) +{ + int err = 0, tmp __maybe_unused; + struct sctx_info __user *info = extctx->lbt.addr; + struct lbt_context __user *lbt_ctx = + (struct lbt_context *)get_ctx_through_ctxinfo(info); + uint64_t __user *regs = (uint64_t *)&lbt_ctx->regs; + uint32_t __user *eflags = (uint32_t *)&lbt_ctx->eflags; + + while (1) { + lock_lbt_owner(); + if (is_lbt_owner()) + err |= restore_hw_lbt_context(lbt_ctx); + else + err |= copy_lbt_from_sigcontext(lbt_ctx); + if (is_fpu_owner()) + err |= restore_hw_ftop_context(lbt_ctx); + else + err |= copy_ftop_from_sigcontext(lbt_ctx); + unlock_lbt_owner(); + + if (likely(!err)) + break; + /* Touch the LBT context and try again */ + err = __get_user(tmp, ®s[0]) | __get_user(tmp, eflags); + + if (err) + return err; + } + + return err; +} +#endif + static int setup_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, struct extctx_layout *extctx) { @@ -539,6 +703,11 @@ static int setup_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, else if (extctx->fpu.addr) err |= protected_save_fpu_context(extctx); +#ifdef CONFIG_CPU_HAS_LBT + if (extctx->lbt.addr) + err |= protected_save_lbt_context(extctx); +#endif + /* Set the "end" magic */ info = (struct sctx_info *)extctx->end.addr; err |= __put_user(0, &info->magic); @@ -584,6 +753,13 @@ static int parse_extcontext(struct sigcontext __user *sc, struct extctx_layout * extctx->lasx.addr = info; break; + case LBT_CTX_MAGIC: + if (size < (sizeof(struct sctx_info) + + sizeof(struct lbt_context))) + goto invalid; + extctx->lbt.addr = info; + break; + default: goto 
invalid; } @@ -636,6 +812,11 @@ static int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc else if (extctx.fpu.addr) err |= protected_restore_fpu_context(&extctx); +#ifdef CONFIG_CPU_HAS_LBT + if (extctx.lbt.addr) + err |= protected_restore_lbt_context(&extctx); +#endif + bad: return err; } @@ -700,6 +881,13 @@ static unsigned long setup_extcontext(struct extctx_layout *extctx, unsigned lon sizeof(struct fpu_context), FPU_CTX_ALIGN, new_sp); } +#ifdef CONFIG_CPU_HAS_LBT + if (cpu_has_lbt && thread_lbt_context_live()) { + new_sp = extframe_alloc(extctx, &extctx->lbt, + sizeof(struct lbt_context), LBT_CTX_ALIGN, new_sp); + } +#endif + return new_sp; } diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index 89699db45cec..7ed01a7b2542 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -966,13 +967,47 @@ asmlinkage void noinstr do_lasx(struct pt_regs *regs) irqentry_exit(regs, state); } +static void init_restore_lbt(void) +{ + if (!thread_lbt_context_live()) { + /* First time LBT context user */ + init_lbt(); + set_thread_flag(TIF_LBT_CTX_LIVE); + } else { + if (!is_lbt_owner()) + own_lbt_inatomic(1); + } + + BUG_ON(!is_lbt_enabled()); +} + asmlinkage void noinstr do_lbt(struct pt_regs *regs) { irqentry_state_t state = irqentry_enter(regs); - local_irq_enable(); - force_sig(SIGILL); - local_irq_disable(); + /* + * BTD (Binary Translation Disable exception) can be triggered + * during FP save/restore if TM (Top Mode) is on, which may + * cause irq_enable during 'switch_to'. To avoid this situation + * (including the user using 'MOVGR2GCSR' to turn on TM, which + * will not trigger the BTE), we need to check PRMD first. + */ + if (regs->csr_prmd & CSR_PRMD_PIE) + local_irq_enable(); + + if (!cpu_has_lbt) { + force_sig(SIGILL); + goto out; + } + BUG_ON(is_lbt_enabled()); + + preempt_disable(); + init_restore_lbt(); + preempt_enable(); + +out: + if (regs->csr_prmd & CSR_PRMD_PIE) + local_irq_disable(); irqentry_exit(regs, state); } From e14dd076964ef11e9d6e3b06a2f1c6bb7d034133 Mon Sep 17 00:00:00 2001 From: Qing Zhang Date: Wed, 6 Sep 2023 22:53:55 +0800 Subject: [PATCH 14/25] LoongArch: Add basic KGDB & KDB support KGDB is intended to be used as a source level debugger for the Linux kernel. It is used along with gdb to debug a Linux kernel. GDB can be used to "break in" to the kernel to inspect memory, variables and regs similar to the way an application developer would use GDB to debug an application. KDB is a frontend of KGDB which is similar to GDB. By now, in addition to the generic KGDB features, the LoongArch KGDB implements the following features: - Hardware breakpoints/watchpoints; - Software single-step support for KDB. 
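For readers unfamiliar with the facility, a typical session looks roughly as
follows; the serial device, baud rate and chosen breakpoint are illustrative
assumptions only, not anything this patch mandates:

    # Target (kernel built with CONFIG_KGDB_SERIAL_CONSOLE):
    echo ttyS0,115200 > /sys/module/kgdboc/parameters/kgdboc
    echo g > /proc/sysrq-trigger        # drop into the debugger

    # Host:
    gdb vmlinux
    (gdb) target remote /dev/ttyS0
    (gdb) hbreak do_page_fault          # exercises the hardware breakpoints
    (gdb) continue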
Signed-off-by: Qing Zhang # Framework & CoreFeature Signed-off-by: Binbin Zhou # BreakPoint & SingleStep Signed-off-by: Hui Li # Some Minor Improvements Signed-off-by: Randy Dunlap # Some Build Error Fixes Signed-off-by: Huacai Chen --- .../features/debug/kgdb/arch-support.txt | 2 +- arch/loongarch/Kconfig | 1 + arch/loongarch/include/asm/kgdb.h | 97 +++ arch/loongarch/include/asm/stackframe.h | 4 + arch/loongarch/kernel/Makefile | 1 + arch/loongarch/kernel/entry.S | 5 + arch/loongarch/kernel/kgdb.c | 727 ++++++++++++++++++ arch/loongarch/kernel/traps.c | 9 + 8 files changed, 845 insertions(+), 1 deletion(-) create mode 100644 arch/loongarch/include/asm/kgdb.h create mode 100644 arch/loongarch/kernel/kgdb.c diff --git a/Documentation/features/debug/kgdb/arch-support.txt b/Documentation/features/debug/kgdb/arch-support.txt index 958498f9f2a4..5e91ec78c80b 100644 --- a/Documentation/features/debug/kgdb/arch-support.txt +++ b/Documentation/features/debug/kgdb/arch-support.txt @@ -13,7 +13,7 @@ | csky: | TODO | | hexagon: | ok | | ia64: | TODO | - | loongarch: | TODO | + | loongarch: | ok | | m68k: | TODO | | microblaze: | ok | | mips: | ok | diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 21cc2e2d1f27..b02e399b1cd7 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -91,6 +91,7 @@ config LOONGARCH select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE + select HAVE_ARCH_KGDB if PERF_EVENTS select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK diff --git a/arch/loongarch/include/asm/kgdb.h b/arch/loongarch/include/asm/kgdb.h new file mode 100644 index 000000000000..2041ae58b161 --- /dev/null +++ b/arch/loongarch/include/asm/kgdb.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Loongson Technology Corporation Limited + */ + +#ifndef _ASM_LOONGARCH_KGDB_H +#define _ASM_LOONGARCH_KGDB_H + +#define GDB_SIZEOF_REG sizeof(u64) + +/* gdb remote procotol expects the following register layout. */ + +/* + * General purpose registers: + * r0-r31: 64 bit + * orig_a0: 64 bit + * pc : 64 bit + * csr_badvaddr: 64 bit + */ +#define DBG_PT_REGS_BASE 0 +#define DBG_PT_REGS_NUM 35 +#define DBG_PT_REGS_END (DBG_PT_REGS_BASE + DBG_PT_REGS_NUM - 1) + +/* + * Floating point registers: + * f0-f31: 64 bit + */ +#define DBG_FPR_BASE (DBG_PT_REGS_END + 1) +#define DBG_FPR_NUM 32 +#define DBG_FPR_END (DBG_FPR_BASE + DBG_FPR_NUM - 1) + +/* + * Condition Flag registers: + * fcc0-fcc8: 8 bit + */ +#define DBG_FCC_BASE (DBG_FPR_END + 1) +#define DBG_FCC_NUM 8 +#define DBG_FCC_END (DBG_FCC_BASE + DBG_FCC_NUM - 1) + +/* + * Floating-point Control and Status registers: + * fcsr: 32 bit + */ +#define DBG_FCSR_NUM 1 +#define DBG_FCSR (DBG_FCC_END + 1) + +#define DBG_MAX_REG_NUM (DBG_FCSR + 1) + +/* + * Size of I/O buffer for gdb packet. + * considering to hold all register contents, size is set + */ +#define BUFMAX 2048 + +/* + * Number of bytes required for gdb_regs buffer. + * PT_REGS and FPR: 8 bytes; FCSR: 4 bytes; FCC: 1 bytes. + * GDB fails to connect for size beyond this with error + * "'g' packet reply is too long" + */ +#define NUMREGBYTES ((DBG_PT_REGS_NUM + DBG_FPR_NUM) * GDB_SIZEOF_REG + DBG_FCC_NUM * 1 + DBG_FCSR_NUM * 4) + +#define BREAK_INSTR_SIZE 4 +#define CACHE_FLUSH_IS_SAFE 0 + +/* Register numbers of various important registers. 
*/ +enum dbg_loongarch_regnum { + DBG_LOONGARCH_ZERO = 0, + DBG_LOONGARCH_RA, + DBG_LOONGARCH_TP, + DBG_LOONGARCH_SP, + DBG_LOONGARCH_A0, + DBG_LOONGARCH_FP = 22, + DBG_LOONGARCH_S0, + DBG_LOONGARCH_S1, + DBG_LOONGARCH_S2, + DBG_LOONGARCH_S3, + DBG_LOONGARCH_S4, + DBG_LOONGARCH_S5, + DBG_LOONGARCH_S6, + DBG_LOONGARCH_S7, + DBG_LOONGARCH_S8, + DBG_LOONGARCH_ORIG_A0, + DBG_LOONGARCH_PC, + DBG_LOONGARCH_BADV +}; + +void kgdb_breakinst(void); +void arch_kgdb_breakpoint(void); + +#ifdef CONFIG_KGDB +bool kgdb_breakpoint_handler(struct pt_regs *regs); +#else /* !CONFIG_KGDB */ +static inline bool kgdb_breakpoint_handler(struct pt_regs *regs) { return false; } +#endif /* CONFIG_KGDB */ + +#endif /* __ASM_KGDB_H_ */ diff --git a/arch/loongarch/include/asm/stackframe.h b/arch/loongarch/include/asm/stackframe.h index 7df80e6ae9d2..4fb1e6408b98 100644 --- a/arch/loongarch/include/asm/stackframe.h +++ b/arch/loongarch/include/asm/stackframe.h @@ -158,6 +158,10 @@ cfi_st u0, PT_R21, \docfi csrrd u0, PERCPU_BASE_KS 9: +#ifdef CONFIG_KGDB + li.w t0, CSR_CRMD_WE + csrxchg t0, t0, LOONGARCH_CSR_CRMD +#endif .endm .macro SAVE_ALL docfi=0 diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile index 5341dcf15a17..c0b88528cb81 100644 --- a/arch/loongarch/kernel/Makefile +++ b/arch/loongarch/kernel/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_UNWINDER_PROLOGUE) += unwind_prologue.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_regs.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o +obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_RETHOOK) += rethook.o rethook_trampoline.o obj-$(CONFIG_UPROBES) += uprobes.o diff --git a/arch/loongarch/kernel/entry.S b/arch/loongarch/kernel/entry.S index d737e3cf42d3..65518bb8f472 100644 --- a/arch/loongarch/kernel/entry.S +++ b/arch/loongarch/kernel/entry.S @@ -58,6 +58,11 @@ SYM_FUNC_START(handle_syscall) SAVE_STATIC +#ifdef CONFIG_KGDB + li.w t1, CSR_CRMD_WE + csrxchg t1, t1, LOONGARCH_CSR_CRMD +#endif + move u0, t0 li.d tp, ~_THREAD_MASK and tp, tp, sp diff --git a/arch/loongarch/kernel/kgdb.c b/arch/loongarch/kernel/kgdb.c new file mode 100644 index 000000000000..445c452d72a7 --- /dev/null +++ b/arch/loongarch/kernel/kgdb.c @@ -0,0 +1,727 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * LoongArch KGDB support + * + * Copyright (C) 2023 Loongson Technology Corporation Limited + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +int kgdb_watch_activated; +static unsigned int stepped_opcode; +static unsigned long stepped_address; + +struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { + { "r0", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[0]) }, + { "r1", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[1]) }, + { "r2", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[2]) }, + { "r3", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[3]) }, + { "r4", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[4]) }, + { "r5", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[5]) }, + { "r6", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[6]) }, + { "r7", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[7]) }, + { "r8", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[8]) }, + { "r9", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[9]) }, + { "r10", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[10]) }, + { "r11", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[11]) }, + { "r12", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[12]) }, + { "r13", GDB_SIZEOF_REG, 
offsetof(struct pt_regs, regs[13]) }, + { "r14", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[14]) }, + { "r15", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[15]) }, + { "r16", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[16]) }, + { "r17", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[17]) }, + { "r18", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[18]) }, + { "r19", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[19]) }, + { "r20", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[20]) }, + { "r21", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[21]) }, + { "r22", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[22]) }, + { "r23", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[23]) }, + { "r24", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[24]) }, + { "r25", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[25]) }, + { "r26", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[26]) }, + { "r27", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[27]) }, + { "r28", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[28]) }, + { "r29", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[29]) }, + { "r30", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[30]) }, + { "r31", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[31]) }, + { "orig_a0", GDB_SIZEOF_REG, offsetof(struct pt_regs, orig_a0) }, + { "pc", GDB_SIZEOF_REG, offsetof(struct pt_regs, csr_era) }, + { "badv", GDB_SIZEOF_REG, offsetof(struct pt_regs, csr_badvaddr) }, + { "f0", GDB_SIZEOF_REG, 0 }, + { "f1", GDB_SIZEOF_REG, 1 }, + { "f2", GDB_SIZEOF_REG, 2 }, + { "f3", GDB_SIZEOF_REG, 3 }, + { "f4", GDB_SIZEOF_REG, 4 }, + { "f5", GDB_SIZEOF_REG, 5 }, + { "f6", GDB_SIZEOF_REG, 6 }, + { "f7", GDB_SIZEOF_REG, 7 }, + { "f8", GDB_SIZEOF_REG, 8 }, + { "f9", GDB_SIZEOF_REG, 9 }, + { "f10", GDB_SIZEOF_REG, 10 }, + { "f11", GDB_SIZEOF_REG, 11 }, + { "f12", GDB_SIZEOF_REG, 12 }, + { "f13", GDB_SIZEOF_REG, 13 }, + { "f14", GDB_SIZEOF_REG, 14 }, + { "f15", GDB_SIZEOF_REG, 15 }, + { "f16", GDB_SIZEOF_REG, 16 }, + { "f17", GDB_SIZEOF_REG, 17 }, + { "f18", GDB_SIZEOF_REG, 18 }, + { "f19", GDB_SIZEOF_REG, 19 }, + { "f20", GDB_SIZEOF_REG, 20 }, + { "f21", GDB_SIZEOF_REG, 21 }, + { "f22", GDB_SIZEOF_REG, 22 }, + { "f23", GDB_SIZEOF_REG, 23 }, + { "f24", GDB_SIZEOF_REG, 24 }, + { "f25", GDB_SIZEOF_REG, 25 }, + { "f26", GDB_SIZEOF_REG, 26 }, + { "f27", GDB_SIZEOF_REG, 27 }, + { "f28", GDB_SIZEOF_REG, 28 }, + { "f29", GDB_SIZEOF_REG, 29 }, + { "f30", GDB_SIZEOF_REG, 30 }, + { "f31", GDB_SIZEOF_REG, 31 }, + { "fcc0", 1, 0 }, + { "fcc1", 1, 1 }, + { "fcc2", 1, 2 }, + { "fcc3", 1, 3 }, + { "fcc4", 1, 4 }, + { "fcc5", 1, 5 }, + { "fcc6", 1, 6 }, + { "fcc7", 1, 7 }, + { "fcsr", 4, 0 }, +}; + +char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) +{ + int reg_offset, reg_size; + + if (regno < 0 || regno >= DBG_MAX_REG_NUM) + return NULL; + + reg_offset = dbg_reg_def[regno].offset; + reg_size = dbg_reg_def[regno].size; + + if (reg_offset == -1) + goto out; + + /* Handle general-purpose/orig_a0/pc/badv registers */ + if (regno <= DBG_PT_REGS_END) { + memcpy(mem, (void *)regs + reg_offset, reg_size); + goto out; + } + + if (!(regs->csr_euen & CSR_EUEN_FPEN)) + goto out; + + save_fp(current); + + /* Handle FP registers */ + switch (regno) { + case DBG_FCSR: /* Process the fcsr */ + memcpy(mem, (void *)¤t->thread.fpu.fcsr, reg_size); + break; + case DBG_FCC_BASE ... DBG_FCC_END: /* Process the fcc */ + memcpy(mem, (void *)¤t->thread.fpu.fcc + reg_offset, reg_size); + break; + case DBG_FPR_BASE ... 
DBG_FPR_END: /* Process the fpr */ + memcpy(mem, (void *)¤t->thread.fpu.fpr[reg_offset], reg_size); + break; + default: + break; + } + +out: + return dbg_reg_def[regno].name; +} + +int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) +{ + int reg_offset, reg_size; + + if (regno < 0 || regno >= DBG_MAX_REG_NUM) + return -EINVAL; + + reg_offset = dbg_reg_def[regno].offset; + reg_size = dbg_reg_def[regno].size; + + if (reg_offset == -1) + return 0; + + /* Handle general-purpose/orig_a0/pc/badv registers */ + if (regno <= DBG_PT_REGS_END) { + memcpy((void *)regs + reg_offset, mem, reg_size); + return 0; + } + + if (!(regs->csr_euen & CSR_EUEN_FPEN)) + return 0; + + /* Handle FP registers */ + switch (regno) { + case DBG_FCSR: /* Process the fcsr */ + memcpy((void *)¤t->thread.fpu.fcsr, mem, reg_size); + break; + case DBG_FCC_BASE ... DBG_FCC_END: /* Process the fcc */ + memcpy((void *)¤t->thread.fpu.fcc + reg_offset, mem, reg_size); + break; + case DBG_FPR_BASE ... DBG_FPR_END: /* Process the fpr */ + memcpy((void *)¤t->thread.fpu.fpr[reg_offset], mem, reg_size); + break; + default: + break; + } + + restore_fp(current); + + return 0; +} + +/* + * Similar to regs_to_gdb_regs() except that process is sleeping and so + * we may not be able to get all the info. + */ +void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) +{ + /* Initialize to zero */ + memset((char *)gdb_regs, 0, NUMREGBYTES); + + gdb_regs[DBG_LOONGARCH_RA] = p->thread.reg01; + gdb_regs[DBG_LOONGARCH_TP] = (long)p; + gdb_regs[DBG_LOONGARCH_SP] = p->thread.reg03; + + /* S0 - S8 */ + gdb_regs[DBG_LOONGARCH_S0] = p->thread.reg23; + gdb_regs[DBG_LOONGARCH_S1] = p->thread.reg24; + gdb_regs[DBG_LOONGARCH_S2] = p->thread.reg25; + gdb_regs[DBG_LOONGARCH_S3] = p->thread.reg26; + gdb_regs[DBG_LOONGARCH_S4] = p->thread.reg27; + gdb_regs[DBG_LOONGARCH_S5] = p->thread.reg28; + gdb_regs[DBG_LOONGARCH_S6] = p->thread.reg29; + gdb_regs[DBG_LOONGARCH_S7] = p->thread.reg30; + gdb_regs[DBG_LOONGARCH_S8] = p->thread.reg31; + + /* + * PC use return address (RA), i.e. the moment after return from __switch_to() + */ + gdb_regs[DBG_LOONGARCH_PC] = p->thread.reg01; +} + +void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc) +{ + regs->csr_era = pc; +} + +void arch_kgdb_breakpoint(void) +{ + __asm__ __volatile__ ( \ + ".globl kgdb_breakinst\n\t" \ + "nop\n" \ + "kgdb_breakinst:\tbreak 2\n\t"); /* BRK_KDB = 2 */ +} + +/* + * Calls linux_debug_hook before the kernel dies. If KGDB is enabled, + * then try to fall into the debugger + */ +static int kgdb_loongarch_notify(struct notifier_block *self, unsigned long cmd, void *ptr) +{ + struct die_args *args = (struct die_args *)ptr; + struct pt_regs *regs = args->regs; + + /* Userspace events, ignore. */ + if (user_mode(regs)) + return NOTIFY_DONE; + + if (!kgdb_io_module_registered) + return NOTIFY_DONE; + + if (atomic_read(&kgdb_active) != -1) + kgdb_nmicallback(smp_processor_id(), regs); + + if (kgdb_handle_exception(args->trapnr, args->signr, cmd, regs)) + return NOTIFY_DONE; + + if (atomic_read(&kgdb_setting_breakpoint)) + if (regs->csr_era == (unsigned long)&kgdb_breakinst) + regs->csr_era += LOONGARCH_INSN_SIZE; + + return NOTIFY_STOP; +} + +bool kgdb_breakpoint_handler(struct pt_regs *regs) +{ + struct die_args args = { + .regs = regs, + .str = "Break", + .err = BRK_KDB, + .trapnr = read_csr_excode(), + .signr = SIGTRAP, + + }; + + return (kgdb_loongarch_notify(NULL, DIE_TRAP, &args) == NOTIFY_STOP) ? 
true : false; +} + +static struct notifier_block kgdb_notifier = { + .notifier_call = kgdb_loongarch_notify, +}; + +static inline void kgdb_arch_update_addr(struct pt_regs *regs, + char *remcom_in_buffer) +{ + unsigned long addr; + char *ptr; + + ptr = &remcom_in_buffer[1]; + if (kgdb_hex2long(&ptr, &addr)) + regs->csr_era = addr; +} + +/* Calculate the new address for after a step */ +static int get_step_address(struct pt_regs *regs, unsigned long *next_addr) +{ + char cj_val; + unsigned int si, si_l, si_h, rd, rj, cj; + unsigned long pc = instruction_pointer(regs); + union loongarch_instruction *ip = (union loongarch_instruction *)pc; + + if (pc & 3) { + pr_warn("%s: invalid pc 0x%lx\n", __func__, pc); + return -EINVAL; + } + + *next_addr = pc + LOONGARCH_INSN_SIZE; + + si_h = ip->reg0i26_format.immediate_h; + si_l = ip->reg0i26_format.immediate_l; + switch (ip->reg0i26_format.opcode) { + case b_op: + *next_addr = pc + sign_extend64((si_h << 16 | si_l) << 2, 27); + return 0; + case bl_op: + *next_addr = pc + sign_extend64((si_h << 16 | si_l) << 2, 27); + regs->regs[1] = pc + LOONGARCH_INSN_SIZE; + return 0; + } + + rj = ip->reg1i21_format.rj; + cj = (rj & 0x07) + DBG_FCC_BASE; + si_l = ip->reg1i21_format.immediate_l; + si_h = ip->reg1i21_format.immediate_h; + dbg_get_reg(cj, &cj_val, regs); + switch (ip->reg1i21_format.opcode) { + case beqz_op: + if (regs->regs[rj] == 0) + *next_addr = pc + sign_extend64((si_h << 16 | si_l) << 2, 22); + return 0; + case bnez_op: + if (regs->regs[rj] != 0) + *next_addr = pc + sign_extend64((si_h << 16 | si_l) << 2, 22); + return 0; + case bceqz_op: /* bceqz_op = bcnez_op */ + if (((rj & 0x18) == 0x00) && !cj_val) /* bceqz */ + *next_addr = pc + sign_extend64((si_h << 16 | si_l) << 2, 22); + if (((rj & 0x18) == 0x08) && cj_val) /* bcnez */ + *next_addr = pc + sign_extend64((si_h << 16 | si_l) << 2, 22); + return 0; + } + + rj = ip->reg2i16_format.rj; + rd = ip->reg2i16_format.rd; + si = ip->reg2i16_format.immediate; + switch (ip->reg2i16_format.opcode) { + case beq_op: + if (regs->regs[rj] == regs->regs[rd]) + *next_addr = pc + sign_extend64(si << 2, 17); + return 0; + case bne_op: + if (regs->regs[rj] != regs->regs[rd]) + *next_addr = pc + sign_extend64(si << 2, 17); + return 0; + case blt_op: + if ((long)regs->regs[rj] < (long)regs->regs[rd]) + *next_addr = pc + sign_extend64(si << 2, 17); + return 0; + case bge_op: + if ((long)regs->regs[rj] >= (long)regs->regs[rd]) + *next_addr = pc + sign_extend64(si << 2, 17); + return 0; + case bltu_op: + if (regs->regs[rj] < regs->regs[rd]) + *next_addr = pc + sign_extend64(si << 2, 17); + return 0; + case bgeu_op: + if (regs->regs[rj] >= regs->regs[rd]) + *next_addr = pc + sign_extend64(si << 2, 17); + return 0; + case jirl_op: + regs->regs[rd] = pc + LOONGARCH_INSN_SIZE; + *next_addr = regs->regs[rj] + sign_extend64(si << 2, 17); + return 0; + } + + return 0; +} + +static int do_single_step(struct pt_regs *regs) +{ + int error = 0; + unsigned long addr = 0; /* Determine where the target instruction will send us to */ + + error = get_step_address(regs, &addr); + if (error) + return error; + + /* Store the opcode in the stepped address */ + error = get_kernel_nofault(stepped_opcode, (void *)addr); + if (error) + return error; + + stepped_address = addr; + + /* Replace the opcode with the break instruction */ + error = copy_to_kernel_nofault((void *)stepped_address, + arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); + flush_icache_range(addr, addr + BREAK_INSTR_SIZE); + + if (error) { + stepped_opcode = 0; + 
stepped_address = 0; + } else { + kgdb_single_step = 1; + atomic_set(&kgdb_cpu_doing_single_step, raw_smp_processor_id()); + } + + return error; +} + +/* Undo a single step */ +static void undo_single_step(struct pt_regs *regs) +{ + if (stepped_opcode) { + copy_to_kernel_nofault((void *)stepped_address, + (void *)&stepped_opcode, BREAK_INSTR_SIZE); + flush_icache_range(stepped_address, stepped_address + BREAK_INSTR_SIZE); + } + + stepped_opcode = 0; + stepped_address = 0; + kgdb_single_step = 0; + atomic_set(&kgdb_cpu_doing_single_step, -1); +} + +int kgdb_arch_handle_exception(int vector, int signo, int err_code, + char *remcom_in_buffer, char *remcom_out_buffer, + struct pt_regs *regs) +{ + int ret = 0; + + undo_single_step(regs); + regs->csr_prmd |= CSR_PRMD_PWE; + + switch (remcom_in_buffer[0]) { + case 'D': + case 'k': + regs->csr_prmd &= ~CSR_PRMD_PWE; + fallthrough; + case 'c': + kgdb_arch_update_addr(regs, remcom_in_buffer); + break; + case 's': + kgdb_arch_update_addr(regs, remcom_in_buffer); + ret = do_single_step(regs); + break; + default: + ret = -1; + } + + return ret; +} + +static struct hw_breakpoint { + unsigned int enabled; + unsigned long addr; + int len; + int type; + struct perf_event * __percpu *pev; +} breakinfo[LOONGARCH_MAX_BRP]; + +static int hw_break_reserve_slot(int breakno) +{ + int cpu, cnt = 0; + struct perf_event **pevent; + + for_each_online_cpu(cpu) { + cnt++; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_reserve_bp_slot(*pevent)) + goto fail; + } + + return 0; + +fail: + for_each_online_cpu(cpu) { + cnt--; + if (!cnt) + break; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + dbg_release_bp_slot(*pevent); + } + + return -1; +} + +static int hw_break_release_slot(int breakno) +{ + int cpu; + struct perf_event **pevent; + + if (dbg_is_early) + return 0; + + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_release_bp_slot(*pevent)) + /* + * The debugger is responsible for handing the retry on + * remove failure. 
+ */ + return -1; + } + + return 0; +} + +static int kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) +{ + int i; + + for (i = 0; i < LOONGARCH_MAX_BRP; i++) + if (!breakinfo[i].enabled) + break; + + if (i == LOONGARCH_MAX_BRP) + return -1; + + switch (bptype) { + case BP_HARDWARE_BREAKPOINT: + breakinfo[i].type = HW_BREAKPOINT_X; + break; + case BP_READ_WATCHPOINT: + breakinfo[i].type = HW_BREAKPOINT_R; + break; + case BP_WRITE_WATCHPOINT: + breakinfo[i].type = HW_BREAKPOINT_W; + break; + case BP_ACCESS_WATCHPOINT: + breakinfo[i].type = HW_BREAKPOINT_RW; + break; + default: + return -1; + } + + switch (len) { + case 1: + breakinfo[i].len = HW_BREAKPOINT_LEN_1; + break; + case 2: + breakinfo[i].len = HW_BREAKPOINT_LEN_2; + break; + case 4: + breakinfo[i].len = HW_BREAKPOINT_LEN_4; + break; + case 8: + breakinfo[i].len = HW_BREAKPOINT_LEN_8; + break; + default: + return -1; + } + + breakinfo[i].addr = addr; + if (hw_break_reserve_slot(i)) { + breakinfo[i].addr = 0; + return -1; + } + breakinfo[i].enabled = 1; + + return 0; +} + +static int kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) +{ + int i; + + for (i = 0; i < LOONGARCH_MAX_BRP; i++) + if (breakinfo[i].addr == addr && breakinfo[i].enabled) + break; + + if (i == LOONGARCH_MAX_BRP) + return -1; + + if (hw_break_release_slot(i)) { + pr_err("Cannot remove hw breakpoint at %lx\n", addr); + return -1; + } + breakinfo[i].enabled = 0; + + return 0; +} + +static void kgdb_disable_hw_break(struct pt_regs *regs) +{ + int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; + + for (i = 0; i < LOONGARCH_MAX_BRP; i++) { + if (!breakinfo[i].enabled) + continue; + + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (bp->attr.disabled == 1) + continue; + + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + } + + /* Disable hardware debugging while we are in kgdb */ + csr_xchg32(0, CSR_CRMD_WE, LOONGARCH_CSR_CRMD); +} + +static void kgdb_remove_all_hw_break(void) +{ + int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; + + for (i = 0; i < LOONGARCH_MAX_BRP; i++) { + if (!breakinfo[i].enabled) + continue; + + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (!bp->attr.disabled) { + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + continue; + } + + if (hw_break_release_slot(i)) + pr_err("KGDB: hw bpt remove failed %lx\n", breakinfo[i].addr); + breakinfo[i].enabled = 0; + } + + csr_xchg32(0, CSR_CRMD_WE, LOONGARCH_CSR_CRMD); + kgdb_watch_activated = 0; +} + +static void kgdb_correct_hw_break(void) +{ + int i, activated = 0; + + for (i = 0; i < LOONGARCH_MAX_BRP; i++) { + struct perf_event *bp; + int val; + int cpu = raw_smp_processor_id(); + + if (!breakinfo[i].enabled) + continue; + + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (bp->attr.disabled != 1) + continue; + + bp->attr.bp_addr = breakinfo[i].addr; + bp->attr.bp_len = breakinfo[i].len; + bp->attr.bp_type = breakinfo[i].type; + + val = hw_breakpoint_arch_parse(bp, &bp->attr, counter_arch_bp(bp)); + if (val) + return; + + val = arch_install_hw_breakpoint(bp); + if (!val) + bp->attr.disabled = 0; + activated = 1; + } + + csr_xchg32(activated ? 
CSR_CRMD_WE : 0, CSR_CRMD_WE, LOONGARCH_CSR_CRMD); + kgdb_watch_activated = activated; +} + +const struct kgdb_arch arch_kgdb_ops = { + .gdb_bpt_instr = {0x02, 0x00, break_op >> 1, 0x00}, /* BRK_KDB = 2 */ + .flags = KGDB_HW_BREAKPOINT, + .set_hw_breakpoint = kgdb_set_hw_break, + .remove_hw_breakpoint = kgdb_remove_hw_break, + .disable_hw_break = kgdb_disable_hw_break, + .remove_all_hw_break = kgdb_remove_all_hw_break, + .correct_hw_break = kgdb_correct_hw_break, +}; + +int kgdb_arch_init(void) +{ + return register_die_notifier(&kgdb_notifier); +} + +void kgdb_arch_late(void) +{ + int i, cpu; + struct perf_event_attr attr; + struct perf_event **pevent; + + hw_breakpoint_init(&attr); + + attr.bp_addr = (unsigned long)kgdb_arch_init; + attr.bp_len = HW_BREAKPOINT_LEN_4; + attr.bp_type = HW_BREAKPOINT_W; + attr.disabled = 1; + + for (i = 0; i < LOONGARCH_MAX_BRP; i++) { + if (breakinfo[i].pev) + continue; + + breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL); + if (IS_ERR((void * __force)breakinfo[i].pev)) { + pr_err("kgdb: Could not allocate hw breakpoints.\n"); + breakinfo[i].pev = NULL; + return; + } + + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[i].pev, cpu); + if (pevent[0]->destroy) { + pevent[0]->destroy = NULL; + release_bp_slot(*pevent); + } + } + } +} + +void kgdb_arch_exit(void) +{ + int i; + + for (i = 0; i < LOONGARCH_MAX_BRP; i++) { + if (breakinfo[i].pev) { + unregister_wide_hw_breakpoint(breakinfo[i].pev); + breakinfo[i].pev = NULL; + } + } + + unregister_die_notifier(&kgdb_notifier); +} diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index 7ed01a7b2542..65214774ef7c 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -703,6 +704,11 @@ asmlinkage void noinstr do_bp(struct pt_regs *regs) * pertain to them. */ switch (bcode) { + case BRK_KDB: + if (kgdb_breakpoint_handler(regs)) + goto out; + else + break; case BRK_KPROBE_BP: if (kprobe_breakpoint_handler(regs)) goto out; @@ -769,6 +775,9 @@ asmlinkage void noinstr do_watch(struct pt_regs *regs) #ifndef CONFIG_HAVE_HW_BREAKPOINT pr_warn("Hardware watch point handler not implemented!\n"); #else + if (kgdb_breakpoint_handler(regs)) + goto out; + if (test_tsk_thread_flag(current, TIF_SINGLESTEP)) { int llbit = (csr_read32(LOONGARCH_CSR_LLBCTL) & 0x1); unsigned long pc = instruction_pointer(regs); From b72961f847c0f0df113ae2d6ac9fd6b1e6bdeaf2 Mon Sep 17 00:00:00 2001 From: Feiyang Chen Date: Wed, 6 Sep 2023 22:53:55 +0800 Subject: [PATCH 15/25] LoongArch: Provide kaslr_offset() to get kernel offset Provide kaslr_offset() to get the kernel offset when KASLR is enabled. 
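As a sketch of how the helper is typically consumed, other architectures feed
this value into their panic output via a notifier; the function below is an
assumption modelled on the x86/arm64 pattern, not something added by this
patch:

    /* Hypothetical consumer: report the KASLR slide when the kernel dies. */
    static int dump_kernel_offset(struct notifier_block *self,
                                  unsigned long v, void *p)
    {
            pr_emerg("Kernel Offset: 0x%lx from 0x%lx\n",
                     kaslr_offset(), (unsigned long)VMLINUX_LOAD_ADDRESS);
            return 0;
    }

    static struct notifier_block kernel_offset_notifier = {
            .notifier_call = dump_kernel_offset
    };
    /* ...registered on panic_notifier_list during setup_arch(). */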
Signed-off-by: Feiyang Chen Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/setup.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/loongarch/include/asm/setup.h b/arch/loongarch/include/asm/setup.h index be05c0e706a2..7c68b4365a4d 100644 --- a/arch/loongarch/include/asm/setup.h +++ b/arch/loongarch/include/asm/setup.h @@ -7,6 +7,7 @@ #define _LOONGARCH_SETUP_H #include +#include #include #define VECSIZE 0x200 @@ -37,4 +38,9 @@ extern void * __init relocate_kernel(void); #endif +static inline unsigned long kaslr_offset(void) +{ + return (unsigned long)&_text - VMLINUX_LOAD_ADDRESS; +} + #endif /* __SETUP_H */ From 2363088eba2ecccfb643725e4864af73c4226a04 Mon Sep 17 00:00:00 2001 From: Feiyang Chen Date: Wed, 6 Sep 2023 22:53:55 +0800 Subject: [PATCH 16/25] LoongArch: Allow building with kcov coverage Add ARCH_HAS_KCOV and HAVE_GCC_PLUGINS to the LoongArch Kconfig. And also disable instrumentation of vdso. Signed-off-by: Feiyang Chen Signed-off-by: Huacai Chen --- Documentation/features/debug/kcov/arch-support.txt | 2 +- arch/loongarch/Kconfig | 2 ++ arch/loongarch/vdso/Makefile | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/features/debug/kcov/arch-support.txt b/Documentation/features/debug/kcov/arch-support.txt index ffcc9f2b1d74..de84cefbcdd3 100644 --- a/Documentation/features/debug/kcov/arch-support.txt +++ b/Documentation/features/debug/kcov/arch-support.txt @@ -13,7 +13,7 @@ | csky: | TODO | | hexagon: | TODO | | ia64: | TODO | - | loongarch: | TODO | + | loongarch: | ok | | m68k: | TODO | | microblaze: | TODO | | mips: | ok | diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index b02e399b1cd7..6cda4843f93b 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -13,6 +13,7 @@ config LOONGARCH select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_CPU_FINALIZE_INIT select ARCH_HAS_FORTIFY_SOURCE + select ARCH_HAS_KCOV select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_SPECIAL @@ -116,6 +117,7 @@ config LOONGARCH select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER + select HAVE_GCC_PLUGINS select HAVE_GENERIC_VDSO select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_IOREMAP_PROT diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile index a50308b6fc25..d8b75f07c869 100644 --- a/arch/loongarch/vdso/Makefile +++ b/arch/loongarch/vdso/Makefile @@ -1,6 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 # Objects to go into the VDSO. +KCOV_INSTRUMENT := n + # Include the generic Makefile to check the built vdso. include $(srctree)/lib/vdso/Makefile From ec9fee79d48f2f05cb1b95dc901071aa3670f228 Mon Sep 17 00:00:00 2001 From: Enze Li Date: Wed, 6 Sep 2023 22:53:55 +0800 Subject: [PATCH 17/25] kfence: Defer the assignment of the local variable addr The LoongArch architecture is different from other architectures. It needs to update __kfence_pool during arch_kfence_init_pool(). This patch modifies the assignment location of the local variable addr in the kfence_init_pool() function to support the case of updating __kfence_pool in arch_kfence_init_pool(). 
Acked-by: Marco Elver Signed-off-by: Enze Li Signed-off-by: Huacai Chen --- mm/kfence/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index dad3c0eb70a0..e124ffff489f 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -566,13 +566,14 @@ static void rcu_guarded_free(struct rcu_head *h) */ static unsigned long kfence_init_pool(void) { - unsigned long addr = (unsigned long)__kfence_pool; + unsigned long addr; struct page *pages; int i; if (!arch_kfence_init_pool()) - return addr; + return (unsigned long)__kfence_pool; + addr = (unsigned long)__kfence_pool; pages = virt_to_page(__kfence_pool); /* From 8b5cb1cbf33292372933e727805e68b506852234 Mon Sep 17 00:00:00 2001 From: Enze Li Date: Wed, 6 Sep 2023 22:53:55 +0800 Subject: [PATCH 18/25] LoongArch: mm: Add page table mapped mode support for virt_to_page() According to LoongArch documentations, there are two types of address translation modes: direct mapped address translation mode (DMW mode) and page table mapped address translation mode (TLB mode). Currently, virt_to_page() only supports direct mapped mode. This patch determines which mode is used, and adds corresponding handling functions for both modes. For more details on the two modes, see [1]. [1] https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#virtual-address-space-and-address-translation-mode Signed-off-by: Enze Li Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/page.h | 7 ++++++- arch/loongarch/include/asm/pgtable.h | 3 +++ arch/loongarch/mm/pgtable.c | 12 ++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/include/asm/page.h b/arch/loongarch/include/asm/page.h index 26e8dccb6619..63f137ce82a4 100644 --- a/arch/loongarch/include/asm/page.h +++ b/arch/loongarch/include/asm/page.h @@ -84,7 +84,12 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define sym_to_pfn(x) __phys_to_pfn(__pa_symbol(x)) #define virt_to_pfn(kaddr) PFN_DOWN(PHYSADDR(kaddr)) -#define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) + +#define virt_to_page(kaddr) \ +({ \ + (likely((unsigned long)kaddr < vm_map_base)) ? \ + dmw_virt_to_page((unsigned long)kaddr) : tlb_virt_to_page((unsigned long)kaddr);\ +}) extern int __virt_addr_valid(volatile void *kaddr); #define virt_addr_valid(kaddr) __virt_addr_valid((volatile void *)(kaddr)) diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 7699af049443..338d1b147464 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -353,6 +353,9 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *pt extern pgd_t swapper_pg_dir[]; extern pgd_t invalid_pg_dir[]; +struct page *dmw_virt_to_page(unsigned long kaddr); +struct page *tlb_virt_to_page(unsigned long kaddr); + /* * The following only work if pte_present() is true. * Undefined behaviour if not.. 
diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c index 36a6dc0148ae..482923251824 100644 --- a/arch/loongarch/mm/pgtable.c +++ b/arch/loongarch/mm/pgtable.c @@ -9,6 +9,18 @@ #include #include +struct page *dmw_virt_to_page(unsigned long kaddr) +{ + return pfn_to_page(virt_to_pfn(kaddr)); +} +EXPORT_SYMBOL_GPL(dmw_virt_to_page); + +struct page *tlb_virt_to_page(unsigned long kaddr) +{ + return pfn_to_page(pte_pfn(*virt_to_kpte(kaddr))); +} +EXPORT_SYMBOL_GPL(tlb_virt_to_page); + pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *ret, *init; From 95bb5b617beec0275bcc47b68796b34852fe4ecb Mon Sep 17 00:00:00 2001 From: Enze Li Date: Wed, 6 Sep 2023 22:54:16 +0800 Subject: [PATCH 19/25] LoongArch: Get partial stack information when providing regs parameter Currently, arch_stack_walk() can only get the full stack information including NMI. This is because the implementation of arch_stack_walk() is forced to ignore the information passed by the regs parameter and use the current stack information instead. For some detection systems like KFENCE, only partial stack information is needed. In particular, the stack frame where the interrupt occurred. To support KFENCE, this patch modifies the implementation of the arch_stack_walk() function so that if this function is called with the regs argument passed, it retains all the stack information in regs and uses it to provide accurate information. Before this patch: [ 1.531195 ] ================================================================== [ 1.531442 ] BUG: KFENCE: out-of-bounds read in stack_trace_save_regs+0x48/0x6c [ 1.531442 ] [ 1.531900 ] Out-of-bounds read at 0xffff800012267fff (1B left of kfence-#12): [ 1.532046 ] stack_trace_save_regs+0x48/0x6c [ 1.532169 ] kfence_report_error+0xa4/0x528 [ 1.532276 ] kfence_handle_page_fault+0x124/0x270 [ 1.532388 ] no_context+0x50/0x94 [ 1.532453 ] do_page_fault+0x1a8/0x36c [ 1.532524 ] tlb_do_page_fault_0+0x118/0x1b4 [ 1.532623 ] test_out_of_bounds_read+0xa0/0x1d8 [ 1.532745 ] kunit_generic_run_threadfn_adapter+0x1c/0x28 [ 1.532854 ] kthread+0x124/0x130 [ 1.532922 ] ret_from_kernel_thread+0xc/0xa4 After this patch: [ 1.320220 ] ================================================================== [ 1.320401 ] BUG: KFENCE: out-of-bounds read in test_out_of_bounds_read+0xa8/0x1d8 [ 1.320401 ] [ 1.320898 ] Out-of-bounds read at 0xffff800012257fff (1B left of kfence-#10): [ 1.321134 ] test_out_of_bounds_read+0xa8/0x1d8 [ 1.321264 ] kunit_generic_run_threadfn_adapter+0x1c/0x28 [ 1.321392 ] kthread+0x124/0x130 [ 1.321459 ] ret_from_kernel_thread+0xc/0xa4 Suggested-by: Jinyang He Signed-off-by: Enze Li Signed-off-by: Huacai Chen --- arch/loongarch/kernel/stacktrace.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/loongarch/kernel/stacktrace.c b/arch/loongarch/kernel/stacktrace.c index 2463d2fea21f..92270f14db94 100644 --- a/arch/loongarch/kernel/stacktrace.c +++ b/arch/loongarch/kernel/stacktrace.c @@ -18,17 +18,19 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, struct pt_regs dummyregs; struct unwind_state state; - regs = &dummyregs; + if (!regs) { + regs = &dummyregs; - if (task == current) { - regs->regs[3] = (unsigned long)__builtin_frame_address(0); - regs->csr_era = (unsigned long)__builtin_return_address(0); - } else { - regs->regs[3] = thread_saved_fp(task); - regs->csr_era = thread_saved_ra(task); + if (task == current) { + regs->regs[3] = (unsigned long)__builtin_frame_address(0); + regs->csr_era = (unsigned 
long)__builtin_return_address(0); + } else { + regs->regs[3] = thread_saved_fp(task); + regs->csr_era = thread_saved_ra(task); + } + regs->regs[1] = 0; } - regs->regs[1] = 0; for (unwind_start(&state, task, regs); !unwind_done(&state) && !unwind_error(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); From 6ad3df56bb199134800933df2afcd7df3b03ef33 Mon Sep 17 00:00:00 2001 From: Enze Li Date: Wed, 6 Sep 2023 22:54:16 +0800 Subject: [PATCH 20/25] LoongArch: Add KFENCE (Kernel Electric-Fence) support The LoongArch architecture is quite different from other architectures. When the allocating of KFENCE itself is done, it is mapped to the direct mapping configuration window [1] by default on LoongArch. It means that it is not possible to use the page table mapped mode which required by the KFENCE system and therefore it should be remapped to the appropriate region. This patch adds architecture specific implementation details for KFENCE. In particular, this implements the required interface in . Tested this patch by running the testcases and all passed. [1] https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#virtual-address-space-and-address-translation-mode Signed-off-by: Enze Li Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 1 + arch/loongarch/include/asm/kfence.h | 61 ++++++++++++++++++++++++++++ arch/loongarch/include/asm/pgtable.h | 11 ++++- arch/loongarch/mm/fault.c | 22 ++++++---- 4 files changed, 86 insertions(+), 9 deletions(-) create mode 100644 arch/loongarch/include/asm/kfence.h diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 6cda4843f93b..0619ec165424 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -92,6 +92,7 @@ config LOONGARCH select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE + select HAVE_ARCH_KFENCE select HAVE_ARCH_KGDB if PERF_EVENTS select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_SECCOMP_FILTER diff --git a/arch/loongarch/include/asm/kfence.h b/arch/loongarch/include/asm/kfence.h new file mode 100644 index 000000000000..6c82aea1c993 --- /dev/null +++ b/arch/loongarch/include/asm/kfence.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KFENCE support for LoongArch. + * + * Author: Enze Li + * Copyright (C) 2022-2023 KylinSoft Corporation. + */ + +#ifndef _ASM_LOONGARCH_KFENCE_H +#define _ASM_LOONGARCH_KFENCE_H + +#include +#include +#include + +static inline bool arch_kfence_init_pool(void) +{ + int err; + char *kfence_pool = __kfence_pool; + struct vm_struct *area; + + area = __get_vm_area_caller(KFENCE_POOL_SIZE, VM_IOREMAP, + KFENCE_AREA_START, KFENCE_AREA_END, + __builtin_return_address(0)); + if (!area) + return false; + + __kfence_pool = (char *)area->addr; + err = ioremap_page_range((unsigned long)__kfence_pool, + (unsigned long)__kfence_pool + KFENCE_POOL_SIZE, + virt_to_phys((void *)kfence_pool), PAGE_KERNEL); + if (err) { + free_vm_area(area); + __kfence_pool = kfence_pool; + return false; + } + + return true; +} + +/* Protect the given page and flush TLB. 
*/ +static inline bool kfence_protect_page(unsigned long addr, bool protect) +{ + pte_t *pte = virt_to_kpte(addr); + + if (WARN_ON(!pte) || pte_none(*pte)) + return false; + + if (protect) + set_pte(pte, __pte(pte_val(*pte) & ~(_PAGE_VALID | _PAGE_PRESENT))); + else + set_pte(pte, __pte(pte_val(*pte) | (_PAGE_VALID | _PAGE_PRESENT))); + + preempt_disable(); + local_flush_tlb_one(addr); + preempt_enable(); + + return true; +} + +#endif /* _ASM_LOONGARCH_KFENCE_H */ diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 338d1b147464..7e3708883994 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -82,14 +82,23 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define MODULES_VADDR (vm_map_base + PCI_IOSIZE + (2 * PAGE_SIZE)) #define MODULES_END (MODULES_VADDR + SZ_256M) +#ifdef CONFIG_KFENCE +#define KFENCE_AREA_SIZE (((CONFIG_KFENCE_NUM_OBJECTS + 1) * 2 + 2) * PAGE_SIZE) +#else +#define KFENCE_AREA_SIZE 0 +#endif + #define VMALLOC_START MODULES_END #define VMALLOC_END \ (vm_map_base + \ - min(PTRS_PER_PGD * PTRS_PER_PUD * PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE, (1UL << cpu_vabits)) - PMD_SIZE - VMEMMAP_SIZE) + min(PTRS_PER_PGD * PTRS_PER_PUD * PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE, (1UL << cpu_vabits)) - PMD_SIZE - VMEMMAP_SIZE - KFENCE_AREA_SIZE) #define vmemmap ((struct page *)((VMALLOC_END + PMD_SIZE) & PMD_MASK)) #define VMEMMAP_END ((unsigned long)vmemmap + VMEMMAP_SIZE - 1) +#define KFENCE_AREA_START (VMEMMAP_END + 1) +#define KFENCE_AREA_END (KFENCE_AREA_START + KFENCE_AREA_SIZE - 1) + #define pte_ERROR(e) \ pr_err("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e)) #ifndef __PAGETABLE_PMD_FOLDED diff --git a/arch/loongarch/mm/fault.c b/arch/loongarch/mm/fault.c index da5b6d518cdb..e6376e3dce86 100644 --- a/arch/loongarch/mm/fault.c +++ b/arch/loongarch/mm/fault.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -30,7 +31,8 @@ int show_unhandled_signals = 1; -static void __kprobes no_context(struct pt_regs *regs, unsigned long address) +static void __kprobes no_context(struct pt_regs *regs, + unsigned long write, unsigned long address) { const int field = sizeof(unsigned long) * 2; @@ -38,6 +40,9 @@ static void __kprobes no_context(struct pt_regs *regs, unsigned long address) if (fixup_exception(regs)) return; + if (kfence_handle_page_fault(address, write, regs)) + return; + /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. @@ -51,14 +56,15 @@ static void __kprobes no_context(struct pt_regs *regs, unsigned long address) die("Oops", regs); } -static void __kprobes do_out_of_memory(struct pt_regs *regs, unsigned long address) +static void __kprobes do_out_of_memory(struct pt_regs *regs, + unsigned long write, unsigned long address) { /* * We ran out of memory, call the OOM killer, and return the userspace * (which will retry the fault, or kill us if we got oom-killed). */ if (!user_mode(regs)) { - no_context(regs, address); + no_context(regs, write, address); return; } pagefault_out_of_memory(); @@ -69,7 +75,7 @@ static void __kprobes do_sigbus(struct pt_regs *regs, { /* Kernel mode? Handle exceptions or die */ if (!user_mode(regs)) { - no_context(regs, address); + no_context(regs, write, address); return; } @@ -90,7 +96,7 @@ static void __kprobes do_sigsegv(struct pt_regs *regs, /* Kernel mode? 
Handle exceptions or die */ if (!user_mode(regs)) { - no_context(regs, address); + no_context(regs, write, address); return; } @@ -149,7 +155,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, */ if (address & __UA_LIMIT) { if (!user_mode(regs)) - no_context(regs, address); + no_context(regs, write, address); else do_sigsegv(regs, write, address, si_code); return; @@ -211,7 +217,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) - no_context(regs, address); + no_context(regs, write, address); return; } @@ -232,7 +238,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, if (unlikely(fault & VM_FAULT_ERROR)) { mmap_read_unlock(mm); if (fault & VM_FAULT_OOM) { - do_out_of_memory(regs, address); + do_out_of_memory(regs, write, address); return; } else if (fault & VM_FAULT_SIGSEGV) { do_sigsegv(regs, write, address, si_code); From 9b04c764af18a1dab6d48ca0671f70cdcccf90a2 Mon Sep 17 00:00:00 2001 From: Qing Zhang Date: Wed, 6 Sep 2023 22:54:16 +0800 Subject: [PATCH 21/25] kasan: Add __HAVE_ARCH_SHADOW_MAP to support arch specific mapping MIPS, LoongArch and some other architectures have many holes between different segments and the valid address space (256T available) is insufficient to map all these segments to kasan shadow memory with the common formula provided by kasan core. So we need architecture specific mapping formulas to ensure different segments are mapped individually, and only limited space lengths of those specific segments are mapped to shadow. Therefore, when the incoming address is converted to a shadow, we need to add a condition to determine whether it is valid. Reviewed-by: Andrey Konovalov Signed-off-by: Qing Zhang Signed-off-by: Huacai Chen --- include/linux/kasan.h | 2 ++ mm/kasan/kasan.h | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 819b6bc8ac08..3df5499f7936 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -54,11 +54,13 @@ extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D]; int kasan_populate_early_shadow(const void *shadow_start, const void *shadow_end); +#ifndef __HAVE_ARCH_SHADOW_MAP static inline void *kasan_mem_to_shadow(const void *addr) { return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET; } +#endif int kasan_add_zero_shadow(void *start, unsigned long size); void kasan_remove_zero_shadow(void *start, unsigned long size); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 2e973b36fe07..f70e3d7a602e 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -291,16 +291,22 @@ struct kasan_stack_ring { #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) +#ifndef __HAVE_ARCH_SHADOW_MAP static inline const void *kasan_shadow_to_mem(const void *shadow_addr) { return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT); } +#endif static __always_inline bool addr_has_metadata(const void *addr) { +#ifdef __HAVE_ARCH_SHADOW_MAP + return (kasan_mem_to_shadow((void *)addr) != NULL); +#else return (kasan_reset_tag(addr) >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); +#endif } /** From fb6d5c1d99ab6958c5e284f7aa5f0cc553f0268c Mon Sep 17 00:00:00 2001 From: Qing Zhang Date: Wed, 6 Sep 2023 22:54:16 +0800 Subject: [PATCH 22/25] kasan: Add (pmd|pud)_init for LoongArch zero_(pud|p4d)_populate process LoongArch populates pmd/pud with invalid_pmd_table/invalid_pud_table in pagetable_init, So 
pmd_init/pud_init(p) is required, define them as __weak in mm/kasan/init.c, like mm/sparse-vmemmap.c. Reviewed-by: Andrey Konovalov Signed-off-by: Qing Zhang Signed-off-by: Huacai Chen --- mm/kasan/init.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/mm/kasan/init.c b/mm/kasan/init.c index dcfec277e839..89895f38f722 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -139,6 +139,10 @@ static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr, return 0; } +void __weak __meminit pmd_init(void *addr) +{ +} + static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end) { @@ -166,8 +170,9 @@ static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr, if (!p) return -ENOMEM; } else { - pud_populate(&init_mm, pud, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); + pmd_init(p); + pud_populate(&init_mm, pud, p); } } zero_pmd_populate(pud, addr, next); @@ -176,6 +181,10 @@ static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr, return 0; } +void __weak __meminit pud_init(void *addr) +{ +} + static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, unsigned long end) { @@ -207,8 +216,9 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, if (!p) return -ENOMEM; } else { - p4d_populate(&init_mm, p4d, - early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); + pud_init(p); + p4d_populate(&init_mm, p4d, p); } } zero_pud_populate(p4d, addr, next); From 9fbcc076798ead2af28c854a265d9da83bec8429 Mon Sep 17 00:00:00 2001 From: Qing Zhang Date: Wed, 6 Sep 2023 22:54:16 +0800 Subject: [PATCH 23/25] LoongArch: Simplify the processing of jumping new kernel for KASLR Modified relocate_kernel() doesn't return new kernel's entry point but the random_offset. In this way we share the start_kernel() processing with the normal kernel, which avoids calling 'jr a0' directly and allows some other operations (e.g, kasan_early_init) before start_kernel() when KASLR (CONFIG_RANDOMIZE_BASE) is turned on. 
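The new entry sequence in head.S (see the hunk below) is worth a gloss, since
the 0xc offset is easy to misread. In pseudo-C, with a0 holding the offset
returned by relocate_kernel():

    /*
     * t0 = &pcaddi_insn;           // pcaddi t0, 0  -> PC of this instruction
     * t0 += random_offset;         // add.d  t0, t0, a0
     * goto *(void *)(t0 + 0xc);    // jirl   zero, t0, 0xc
     *
     * The 0xc (three 4-byte instructions) lands execution on the copy of
     * the instruction following the jirl inside the relocated image, so
     * the relocated kernel falls through to the shared "bl start_kernel".
     */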
Signed-off-by: Qing Zhang Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/setup.h | 2 +- arch/loongarch/kernel/head.S | 11 ++++++----- arch/loongarch/kernel/relocate.c | 8 ++------ 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/loongarch/include/asm/setup.h b/arch/loongarch/include/asm/setup.h index 7c68b4365a4d..a0bc159ce8bd 100644 --- a/arch/loongarch/include/asm/setup.h +++ b/arch/loongarch/include/asm/setup.h @@ -34,7 +34,7 @@ extern long __la_abs_end; extern long __rela_dyn_begin; extern long __rela_dyn_end; -extern void * __init relocate_kernel(void); +extern unsigned long __init relocate_kernel(void); #endif diff --git a/arch/loongarch/kernel/head.S b/arch/loongarch/kernel/head.S index 5e828a8bc0a0..5743fef70dfe 100644 --- a/arch/loongarch/kernel/head.S +++ b/arch/loongarch/kernel/head.S @@ -95,13 +95,14 @@ SYM_CODE_START(kernel_entry) # kernel entry point PTR_LI sp, (_THREAD_SIZE - PT_SIZE) PTR_ADD sp, sp, tp set_saved_sp sp, t0, t1 -#endif - /* relocate_kernel() returns the new kernel entry point */ - jr a0 - ASM_BUG() + /* Jump to the new kernel: new_pc = current_pc + random_offset */ + pcaddi t0, 0 + add.d t0, t0, a0 + jirl zero, t0, 0xc +#endif /* CONFIG_RANDOMIZE_BASE */ -#endif +#endif /* CONFIG_RELOCATABLE */ bl start_kernel ASM_BUG() diff --git a/arch/loongarch/kernel/relocate.c b/arch/loongarch/kernel/relocate.c index 01f94d1e3edf..6c3eff9af9fb 100644 --- a/arch/loongarch/kernel/relocate.c +++ b/arch/loongarch/kernel/relocate.c @@ -157,12 +157,11 @@ static inline void __init update_reloc_offset(unsigned long *addr, long random_o *new_addr = (unsigned long)reloc_offset; } -void * __init relocate_kernel(void) +unsigned long __init relocate_kernel(void) { unsigned long kernel_length; unsigned long random_offset = 0; void *location_new = _text; /* Default to original kernel start */ - void *kernel_entry = start_kernel; /* Default to original kernel entry point */ char *cmdline = early_ioremap(fw_arg1, COMMAND_LINE_SIZE); /* Boot command line is passed in fw_arg1 */ strscpy(boot_command_line, cmdline, COMMAND_LINE_SIZE); @@ -190,9 +189,6 @@ void * __init relocate_kernel(void) reloc_offset += random_offset; - /* Return the new kernel's entry point */ - kernel_entry = RELOCATED_KASLR(start_kernel); - /* The current thread is now within the relocated kernel */ __current_thread_info = RELOCATED_KASLR(__current_thread_info); @@ -204,7 +200,7 @@ void * __init relocate_kernel(void) relocate_absolute(random_offset); - return kernel_entry; + return random_offset; } /* From 5aa4ac64e6add3e40d5049e31275b2822daf885d Mon Sep 17 00:00:00 2001 From: Qing Zhang Date: Wed, 6 Sep 2023 22:54:16 +0800 Subject: [PATCH 24/25] LoongArch: Add KASAN (Kernel Address Sanitizer) support 1/8 of kernel addresses reserved for shadow memory. But for LoongArch, There are a lot of holes between different segments and valid address space (256T available) is insufficient to map all these segments to kasan shadow memory with the common formula provided by kasan core, saying (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET So LoongArch has a arch-specific mapping formula, different segments are mapped individually, and only limited space lengths of these specific segments are mapped to shadow. At early boot stage the whole shadow region populated with just one physical page (kasan_early_shadow_page). Later, this page is reused as readonly zero shadow for some memory that kasan currently don't track. 
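A worked example of the per-segment formula may help here; it uses the
constants from the asm/kasan.h hunk added further below, and the address
itself is an arbitrary illustration:

    /*
     * addr   = 0x9000000012345678          -> XKPRANGE_CC (cached) segment
     * maddr  = addr & XRANGE_SHADOW_MASK  = 0x12345678
     * shadow = KASAN_SHADOW_START
     *        + XKPRANGE_CC_KASAN_OFFSET     (0 for this segment)
     *        + (maddr >> KASAN_SHADOW_SCALE_SHIFT),  i.e. maddr / 8
     *
     * Each segment contributes its own KASAN_OFFSET, so the XKPRANGE and
     * XKVRANGE windows land in disjoint shadow ranges rather than one
     * linear mapping.
     */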
After mapping the physical memory, pages for shadow memory are allocated and mapped. Functions like memset()/memcpy()/memmove() do a lot of memory accesses. If bad pointer passed to one of these function it is important to be caught. Compiler's instrumentation cannot do this since these functions are written in assembly. KASan replaces memory functions with manually instrumented variants. Original functions declared as weak symbols so strong definitions in mm/kasan/kasan.c could replace them. Original functions have aliases with '__' prefix in names, so we could call non-instrumented variant if needed. Signed-off-by: Qing Zhang Signed-off-by: Huacai Chen --- Documentation/dev-tools/kasan.rst | 4 +- .../features/debug/KASAN/arch-support.txt | 2 +- .../translations/zh_CN/dev-tools/kasan.rst | 2 +- arch/loongarch/Kconfig | 7 + arch/loongarch/Makefile | 3 + arch/loongarch/include/asm/kasan.h | 126 +++++++++ arch/loongarch/include/asm/pgtable.h | 7 + arch/loongarch/include/asm/string.h | 20 ++ arch/loongarch/kernel/Makefile | 6 + arch/loongarch/kernel/head.S | 4 + arch/loongarch/kernel/setup.c | 4 + arch/loongarch/lib/memcpy.S | 8 +- arch/loongarch/lib/memmove.S | 20 +- arch/loongarch/lib/memset.S | 8 +- arch/loongarch/mm/Makefile | 3 + arch/loongarch/mm/kasan_init.c | 243 ++++++++++++++++++ arch/loongarch/vdso/Makefile | 1 + 17 files changed, 455 insertions(+), 13 deletions(-) create mode 100644 arch/loongarch/include/asm/kasan.h create mode 100644 arch/loongarch/mm/kasan_init.c diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index f4acf9c2e90f..382818a7197a 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -41,8 +41,8 @@ Support Architectures ~~~~~~~~~~~~~ -Generic KASAN is supported on x86_64, arm, arm64, powerpc, riscv, s390, and -xtensa, and the tag-based KASAN modes are supported only on arm64. +Generic KASAN is supported on x86_64, arm, arm64, powerpc, riscv, s390, xtensa, +and loongarch, and the tag-based KASAN modes are supported only on arm64. 
Compilers ~~~~~~~~~ diff --git a/Documentation/features/debug/KASAN/arch-support.txt b/Documentation/features/debug/KASAN/arch-support.txt index bf0124fae643..c4581c2edb28 100644 --- a/Documentation/features/debug/KASAN/arch-support.txt +++ b/Documentation/features/debug/KASAN/arch-support.txt @@ -13,7 +13,7 @@ | csky: | TODO | | hexagon: | TODO | | ia64: | TODO | - | loongarch: | TODO | + | loongarch: | ok | | m68k: | TODO | | microblaze: | TODO | | mips: | TODO | diff --git a/Documentation/translations/zh_CN/dev-tools/kasan.rst b/Documentation/translations/zh_CN/dev-tools/kasan.rst index 05ef904dbcfb..8fdb20c9665b 100644 --- a/Documentation/translations/zh_CN/dev-tools/kasan.rst +++ b/Documentation/translations/zh_CN/dev-tools/kasan.rst @@ -42,7 +42,7 @@ KASAN有三种模式: 体系架构 ~~~~~~~~ -在x86_64、arm、arm64、powerpc、riscv、s390和xtensa上支持通用KASAN, +在x86_64、arm、arm64、powerpc、riscv、s390、xtensa和loongarch上支持通用KASAN, 而基于标签的KASAN模式只在arm64上支持。 编译器 diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 0619ec165424..32157188d5b8 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -8,6 +8,7 @@ config LOONGARCH select ACPI_PPTT if ACPI select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_BINFMT_ELF_STATE + select ARCH_DISABLE_KASAN_INLINE select ARCH_ENABLE_MEMORY_HOTPLUG select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI @@ -92,6 +93,7 @@ config LOONGARCH select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE + select HAVE_ARCH_KASAN select HAVE_ARCH_KFENCE select HAVE_ARCH_KGDB if PERF_EVENTS select HAVE_ARCH_MMAP_RND_BITS if MMU @@ -669,6 +671,11 @@ config ARCH_MMAP_RND_BITS_MAX config ARCH_SUPPORTS_UPROBES def_bool y +config KASAN_SHADOW_OFFSET + hex + default 0x0 + depends on KASAN + menu "Power management options" config ARCH_SUSPEND_POSSIBLE diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index ef87bab46754..fb0fada43197 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -84,7 +84,10 @@ LDFLAGS_vmlinux += -static -pie --no-dynamic-linker -z notext endif cflags-y += $(call cc-option, -mno-check-zero-division) + +ifndef CONFIG_KASAN cflags-y += -fno-builtin-memcpy -fno-builtin-memmove -fno-builtin-memset +endif load-y = 0x9000000000200000 bootvars-y = VMLINUX_LOAD_ADDRESS=$(load-y) diff --git a/arch/loongarch/include/asm/kasan.h b/arch/loongarch/include/asm/kasan.h new file mode 100644 index 000000000000..deeff8158f45 --- /dev/null +++ b/arch/loongarch/include/asm/kasan.h @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_KASAN_H +#define __ASM_KASAN_H + +#ifndef __ASSEMBLY__ + +#include +#include +#include +#include +#include + +#define __HAVE_ARCH_SHADOW_MAP + +#define KASAN_SHADOW_SCALE_SHIFT 3 +#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) + +#define XRANGE_SHIFT (48) + +/* Valid address length */ +#define XRANGE_SHADOW_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3) +/* Used for taking out the valid address */ +#define XRANGE_SHADOW_MASK GENMASK_ULL(XRANGE_SHADOW_SHIFT - 1, 0) +/* One segment whole address space size */ +#define XRANGE_SIZE (XRANGE_SHADOW_MASK + 1) + +/* 64-bit segment value. 
*/ +#define XKPRANGE_UC_SEG (0x8000) +#define XKPRANGE_CC_SEG (0x9000) +#define XKVRANGE_VC_SEG (0xffff) + +/* Cached */ +#define XKPRANGE_CC_START CACHE_BASE +#define XKPRANGE_CC_SIZE XRANGE_SIZE +#define XKPRANGE_CC_KASAN_OFFSET (0) +#define XKPRANGE_CC_SHADOW_SIZE (XKPRANGE_CC_SIZE >> KASAN_SHADOW_SCALE_SHIFT) +#define XKPRANGE_CC_SHADOW_END (XKPRANGE_CC_KASAN_OFFSET + XKPRANGE_CC_SHADOW_SIZE) + +/* UnCached */ +#define XKPRANGE_UC_START UNCACHE_BASE +#define XKPRANGE_UC_SIZE XRANGE_SIZE +#define XKPRANGE_UC_KASAN_OFFSET XKPRANGE_CC_SHADOW_END +#define XKPRANGE_UC_SHADOW_SIZE (XKPRANGE_UC_SIZE >> KASAN_SHADOW_SCALE_SHIFT) +#define XKPRANGE_UC_SHADOW_END (XKPRANGE_UC_KASAN_OFFSET + XKPRANGE_UC_SHADOW_SIZE) + +/* VMALLOC (Cached or UnCached) */ +#define XKVRANGE_VC_START MODULES_VADDR +#define XKVRANGE_VC_SIZE round_up(KFENCE_AREA_END - MODULES_VADDR + 1, PGDIR_SIZE) +#define XKVRANGE_VC_KASAN_OFFSET XKPRANGE_UC_SHADOW_END +#define XKVRANGE_VC_SHADOW_SIZE (XKVRANGE_VC_SIZE >> KASAN_SHADOW_SCALE_SHIFT) +#define XKVRANGE_VC_SHADOW_END (XKVRANGE_VC_KASAN_OFFSET + XKVRANGE_VC_SHADOW_SIZE) + +/* KAsan shadow memory start right after vmalloc. */ +#define KASAN_SHADOW_START round_up(KFENCE_AREA_END, PGDIR_SIZE) +#define KASAN_SHADOW_SIZE (XKVRANGE_VC_SHADOW_END - XKPRANGE_CC_KASAN_OFFSET) +#define KASAN_SHADOW_END round_up(KASAN_SHADOW_START + KASAN_SHADOW_SIZE, PGDIR_SIZE) + +#define XKPRANGE_CC_SHADOW_OFFSET (KASAN_SHADOW_START + XKPRANGE_CC_KASAN_OFFSET) +#define XKPRANGE_UC_SHADOW_OFFSET (KASAN_SHADOW_START + XKPRANGE_UC_KASAN_OFFSET) +#define XKVRANGE_VC_SHADOW_OFFSET (KASAN_SHADOW_START + XKVRANGE_VC_KASAN_OFFSET) + +extern bool kasan_early_stage; +extern unsigned char kasan_early_shadow_page[PAGE_SIZE]; + +#define kasan_arch_is_ready kasan_arch_is_ready +static __always_inline bool kasan_arch_is_ready(void) +{ + return !kasan_early_stage; +} + +static inline void *kasan_mem_to_shadow(const void *addr) +{ + if (!kasan_arch_is_ready()) { + return (void *)(kasan_early_shadow_page); + } else { + unsigned long maddr = (unsigned long)addr; + unsigned long xrange = (maddr >> XRANGE_SHIFT) & 0xffff; + unsigned long offset = 0; + + maddr &= XRANGE_SHADOW_MASK; + switch (xrange) { + case XKPRANGE_CC_SEG: + offset = XKPRANGE_CC_SHADOW_OFFSET; + break; + case XKPRANGE_UC_SEG: + offset = XKPRANGE_UC_SHADOW_OFFSET; + break; + case XKVRANGE_VC_SEG: + offset = XKVRANGE_VC_SHADOW_OFFSET; + break; + default: + WARN_ON(1); + return NULL; + } + + return (void *)((maddr >> KASAN_SHADOW_SCALE_SHIFT) + offset); + } +} + +static inline const void *kasan_shadow_to_mem(const void *shadow_addr) +{ + unsigned long addr = (unsigned long)shadow_addr; + + if (unlikely(addr > KASAN_SHADOW_END) || + unlikely(addr < KASAN_SHADOW_START)) { + WARN_ON(1); + return NULL; + } + + if (addr >= XKVRANGE_VC_SHADOW_OFFSET) + return (void *)(((addr - XKVRANGE_VC_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT) + XKVRANGE_VC_START); + else if (addr >= XKPRANGE_UC_SHADOW_OFFSET) + return (void *)(((addr - XKPRANGE_UC_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT) + XKPRANGE_UC_START); + else if (addr >= XKPRANGE_CC_SHADOW_OFFSET) + return (void *)(((addr - XKPRANGE_CC_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT) + XKPRANGE_CC_START); + else { + WARN_ON(1); + return NULL; + } +} + +void kasan_init(void); +asmlinkage void kasan_early_init(void); + +#endif +#endif diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 7e3708883994..e5675ce0907a 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ 
b/arch/loongarch/include/asm/pgtable.h
@@ -89,9 +89,16 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
 #endif

 #define VMALLOC_START	MODULES_END
+
+#ifndef CONFIG_KASAN
 #define VMALLOC_END	\
 	(vm_map_base +	\
 	 min(PTRS_PER_PGD * PTRS_PER_PUD * PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE, (1UL << cpu_vabits)) - PMD_SIZE - VMEMMAP_SIZE - KFENCE_AREA_SIZE)
+#else
+#define VMALLOC_END	\
+	(vm_map_base +	\
+	 min(PTRS_PER_PGD * PTRS_PER_PUD * PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE, (1UL << cpu_vabits) / 2) - PMD_SIZE - VMEMMAP_SIZE - KFENCE_AREA_SIZE)
+#endif

 #define vmemmap		((struct page *)((VMALLOC_END + PMD_SIZE) & PMD_MASK))
 #define VMEMMAP_END	((unsigned long)vmemmap + VMEMMAP_SIZE - 1)
diff --git a/arch/loongarch/include/asm/string.h b/arch/loongarch/include/asm/string.h
index 7b29cc9c70aa..5bb5a90d2681 100644
--- a/arch/loongarch/include/asm/string.h
+++ b/arch/loongarch/include/asm/string.h
@@ -7,11 +7,31 @@

 #define __HAVE_ARCH_MEMSET
 extern void *memset(void *__s, int __c, size_t __count);
+extern void *__memset(void *__s, int __c, size_t __count);

 #define __HAVE_ARCH_MEMCPY
 extern void *memcpy(void *__to, __const__ void *__from, size_t __n);
+extern void *__memcpy(void *__to, __const__ void *__from, size_t __n);

 #define __HAVE_ARCH_MEMMOVE
 extern void *memmove(void *__dest, __const__ void *__src, size_t __n);
+extern void *__memmove(void *__dest, __const__ void *__src, size_t __n);
+
+#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
+
+/*
+ * For files that are not instrumented (e.g. mm/slub.c) we
+ * should use the non-instrumented versions of the mem* functions.
+ */
+
+#define memset(s, c, n) __memset(s, c, n)
+#define memcpy(dst, src, len) __memcpy(dst, src, len)
+#define memmove(dst, src, len) __memmove(dst, src, len)
+
+#ifndef __NO_FORTIFY
+#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc.
*/ +#endif + +#endif #endif /* _ASM_STRING_H */ diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile index c0b88528cb81..c56ea0b75448 100644 --- a/arch/loongarch/kernel/Makefile +++ b/arch/loongarch/kernel/Makefile @@ -34,6 +34,12 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_rethook_trampoline.o = $(CC_FLAGS_FTRACE) endif +KASAN_SANITIZE_efi.o := n +KASAN_SANITIZE_cpu-probe.o := n +KASAN_SANITIZE_traps.o := n +KASAN_SANITIZE_smp.o := n +KASAN_SANITIZE_vdso.o := n + obj-$(CONFIG_MODULES) += module.o module-sections.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/arch/loongarch/kernel/head.S b/arch/loongarch/kernel/head.S index 5743fef70dfe..53b883db0786 100644 --- a/arch/loongarch/kernel/head.S +++ b/arch/loongarch/kernel/head.S @@ -104,6 +104,10 @@ SYM_CODE_START(kernel_entry) # kernel entry point #endif /* CONFIG_RELOCATABLE */ +#ifdef CONFIG_KASAN + bl kasan_early_init +#endif + bl start_kernel ASM_BUG() diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 9d830ab4e302..7783f0a3d742 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -626,4 +626,8 @@ void __init setup_arch(char **cmdline_p) #endif paging_init(); + +#ifdef CONFIG_KASAN + kasan_init(); +#endif } diff --git a/arch/loongarch/lib/memcpy.S b/arch/loongarch/lib/memcpy.S index cc30b3b6252f..fa1148878d2b 100644 --- a/arch/loongarch/lib/memcpy.S +++ b/arch/loongarch/lib/memcpy.S @@ -10,6 +10,8 @@ #include #include +.section .noinstr.text, "ax" + SYM_FUNC_START(memcpy) /* * Some CPUs support hardware unaligned access @@ -17,9 +19,13 @@ SYM_FUNC_START(memcpy) ALTERNATIVE "b __memcpy_generic", \ "b __memcpy_fast", CPU_FEATURE_UAL SYM_FUNC_END(memcpy) -_ASM_NOKPROBE(memcpy) +SYM_FUNC_ALIAS(__memcpy, memcpy) EXPORT_SYMBOL(memcpy) +EXPORT_SYMBOL(__memcpy) + +_ASM_NOKPROBE(memcpy) +_ASM_NOKPROBE(__memcpy) /* * void *__memcpy_generic(void *dst, const void *src, size_t n) diff --git a/arch/loongarch/lib/memmove.S b/arch/loongarch/lib/memmove.S index 7dc76d1484b6..82dae062fec8 100644 --- a/arch/loongarch/lib/memmove.S +++ b/arch/loongarch/lib/memmove.S @@ -10,23 +10,29 @@ #include #include +.section .noinstr.text, "ax" + SYM_FUNC_START(memmove) - blt a0, a1, memcpy /* dst < src, memcpy */ - blt a1, a0, rmemcpy /* src < dst, rmemcpy */ - jr ra /* dst == src, return */ + blt a0, a1, __memcpy /* dst < src, memcpy */ + blt a1, a0, __rmemcpy /* src < dst, rmemcpy */ + jr ra /* dst == src, return */ SYM_FUNC_END(memmove) -_ASM_NOKPROBE(memmove) +SYM_FUNC_ALIAS(__memmove, memmove) EXPORT_SYMBOL(memmove) +EXPORT_SYMBOL(__memmove) -SYM_FUNC_START(rmemcpy) +_ASM_NOKPROBE(memmove) +_ASM_NOKPROBE(__memmove) + +SYM_FUNC_START(__rmemcpy) /* * Some CPUs support hardware unaligned access */ ALTERNATIVE "b __rmemcpy_generic", \ "b __rmemcpy_fast", CPU_FEATURE_UAL -SYM_FUNC_END(rmemcpy) -_ASM_NOKPROBE(rmemcpy) +SYM_FUNC_END(__rmemcpy) +_ASM_NOKPROBE(__rmemcpy) /* * void *__rmemcpy_generic(void *dst, const void *src, size_t n) diff --git a/arch/loongarch/lib/memset.S b/arch/loongarch/lib/memset.S index 3f20f7996e8e..06d3ca54cbfe 100644 --- a/arch/loongarch/lib/memset.S +++ b/arch/loongarch/lib/memset.S @@ -16,6 +16,8 @@ bstrins.d \r0, \r0, 63, 32 .endm +.section .noinstr.text, "ax" + SYM_FUNC_START(memset) /* * Some CPUs support hardware unaligned access @@ -23,9 +25,13 @@ SYM_FUNC_START(memset) ALTERNATIVE "b __memset_generic", \ "b __memset_fast", CPU_FEATURE_UAL SYM_FUNC_END(memset) -_ASM_NOKPROBE(memset) +SYM_FUNC_ALIAS(__memset, memset) EXPORT_SYMBOL(memset) 
+EXPORT_SYMBOL(__memset) + +_ASM_NOKPROBE(memset) +_ASM_NOKPROBE(__memset) /* * void *__memset_generic(void *s, int c, size_t n) diff --git a/arch/loongarch/mm/Makefile b/arch/loongarch/mm/Makefile index 8ffc6383f836..e4d1e581dbae 100644 --- a/arch/loongarch/mm/Makefile +++ b/arch/loongarch/mm/Makefile @@ -7,3 +7,6 @@ obj-y += init.o cache.o tlb.o tlbex.o extable.o \ fault.o ioremap.o maccess.o mmap.o pgtable.o page.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_KASAN) += kasan_init.o + +KASAN_SANITIZE_kasan_init.o := n diff --git a/arch/loongarch/mm/kasan_init.c b/arch/loongarch/mm/kasan_init.c new file mode 100644 index 000000000000..da68bc1a4643 --- /dev/null +++ b/arch/loongarch/mm/kasan_init.c @@ -0,0 +1,243 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 Loongson Technology Corporation Limited + */ +#define pr_fmt(fmt) "kasan: " fmt +#include +#include +#include + +#include +#include +#include + +static pgd_t kasan_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); + +#ifdef __PAGETABLE_PUD_FOLDED +#define __p4d_none(early, p4d) (0) +#else +#define __p4d_none(early, p4d) (early ? (p4d_val(p4d) == 0) : \ +(__pa(p4d_val(p4d)) == (unsigned long)__pa(kasan_early_shadow_pud))) +#endif + +#ifdef __PAGETABLE_PMD_FOLDED +#define __pud_none(early, pud) (0) +#else +#define __pud_none(early, pud) (early ? (pud_val(pud) == 0) : \ +(__pa(pud_val(pud)) == (unsigned long)__pa(kasan_early_shadow_pmd))) +#endif + +#define __pmd_none(early, pmd) (early ? (pmd_val(pmd) == 0) : \ +(__pa(pmd_val(pmd)) == (unsigned long)__pa(kasan_early_shadow_pte))) + +#define __pte_none(early, pte) (early ? pte_none(pte) : \ +((pte_val(pte) & _PFN_MASK) == (unsigned long)__pa(kasan_early_shadow_page))) + +bool kasan_early_stage = true; + +/* + * Alloc memory for shadow memory page table. + */ +static phys_addr_t __init kasan_alloc_zeroed_page(int node) +{ + void *p = memblock_alloc_try_nid(PAGE_SIZE, PAGE_SIZE, + __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, node); + if (!p) + panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%llx\n", + __func__, PAGE_SIZE, PAGE_SIZE, node, __pa(MAX_DMA_ADDRESS)); + + return __pa(p); +} + +static pte_t *__init kasan_pte_offset(pmd_t *pmdp, unsigned long addr, int node, bool early) +{ + if (__pmd_none(early, READ_ONCE(*pmdp))) { + phys_addr_t pte_phys = early ? + __pa_symbol(kasan_early_shadow_pte) : kasan_alloc_zeroed_page(node); + if (!early) + memcpy(__va(pte_phys), kasan_early_shadow_pte, sizeof(kasan_early_shadow_pte)); + pmd_populate_kernel(NULL, pmdp, (pte_t *)__va(pte_phys)); + } + + return pte_offset_kernel(pmdp, addr); +} + +static pmd_t *__init kasan_pmd_offset(pud_t *pudp, unsigned long addr, int node, bool early) +{ + if (__pud_none(early, READ_ONCE(*pudp))) { + phys_addr_t pmd_phys = early ? + __pa_symbol(kasan_early_shadow_pmd) : kasan_alloc_zeroed_page(node); + if (!early) + memcpy(__va(pmd_phys), kasan_early_shadow_pmd, sizeof(kasan_early_shadow_pmd)); + pud_populate(&init_mm, pudp, (pmd_t *)__va(pmd_phys)); + } + + return pmd_offset(pudp, addr); +} + +static pud_t *__init kasan_pud_offset(p4d_t *p4dp, unsigned long addr, int node, bool early) +{ + if (__p4d_none(early, READ_ONCE(*p4dp))) { + phys_addr_t pud_phys = early ? 
+		__pa_symbol(kasan_early_shadow_pud) : kasan_alloc_zeroed_page(node);
+		if (!early)
+			memcpy(__va(pud_phys), kasan_early_shadow_pud, sizeof(kasan_early_shadow_pud));
+		p4d_populate(&init_mm, p4dp, (pud_t *)__va(pud_phys));
+	}
+
+	return pud_offset(p4dp, addr);
+}
+
+static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr,
+				      unsigned long end, int node, bool early)
+{
+	unsigned long next;
+	pte_t *ptep = kasan_pte_offset(pmdp, addr, node, early);
+
+	do {
+		phys_addr_t page_phys = early ?
+					__pa_symbol(kasan_early_shadow_page)
+					      : kasan_alloc_zeroed_page(node);
+		next = addr + PAGE_SIZE;
+		set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL));
+	} while (ptep++, addr = next, addr != end && __pte_none(early, READ_ONCE(*ptep)));
+}
+
+static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr,
+				      unsigned long end, int node, bool early)
+{
+	unsigned long next;
+	pmd_t *pmdp = kasan_pmd_offset(pudp, addr, node, early);
+
+	do {
+		next = pmd_addr_end(addr, end);
+		kasan_pte_populate(pmdp, addr, next, node, early);
+	} while (pmdp++, addr = next, addr != end && __pmd_none(early, READ_ONCE(*pmdp)));
+}
+
+static void __init kasan_pud_populate(p4d_t *p4dp, unsigned long addr,
+				      unsigned long end, int node, bool early)
+{
+	unsigned long next;
+	pud_t *pudp = kasan_pud_offset(p4dp, addr, node, early);
+
+	do {
+		next = pud_addr_end(addr, end);
+		kasan_pmd_populate(pudp, addr, next, node, early);
+	} while (pudp++, addr = next, addr != end);
+}
+
+static void __init kasan_p4d_populate(pgd_t *pgdp, unsigned long addr,
+				      unsigned long end, int node, bool early)
+{
+	unsigned long next;
+	p4d_t *p4dp = p4d_offset(pgdp, addr);
+
+	do {
+		next = p4d_addr_end(addr, end);
+		kasan_pud_populate(p4dp, addr, next, node, early);
+	} while (p4dp++, addr = next, addr != end);
+}
+
+static void __init kasan_pgd_populate(unsigned long addr, unsigned long end,
+				      int node, bool early)
+{
+	unsigned long next;
+	pgd_t *pgdp;
+
+	pgdp = pgd_offset_k(addr);
+
+	do {
+		next = pgd_addr_end(addr, end);
+		kasan_p4d_populate(pgdp, addr, next, node, early);
+	} while (pgdp++, addr = next, addr != end);
+
+}
+
+/* Set up full kasan mappings, ensuring that the mapped pages are zeroed */
+static void __init kasan_map_populate(unsigned long start, unsigned long end,
+				      int node)
+{
+	kasan_pgd_populate(start & PAGE_MASK, PAGE_ALIGN(end), node, false);
+}
+
+asmlinkage void __init kasan_early_init(void)
+{
+	BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE));
+	BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE));
+}
+
+static inline void kasan_set_pgd(pgd_t *pgdp, pgd_t pgdval)
+{
+	WRITE_ONCE(*pgdp, pgdval);
+}
+
+static void __init clear_pgds(unsigned long start, unsigned long end)
+{
+	/*
+	 * Remove references to the kasan page tables from
+	 * swapper_pg_dir. pgd_clear() can't be used
+	 * here because it's a nop on 2- and 3-level pagetable setups.
+	 */
+	for (; start < end; start += PGDIR_SIZE)
+		kasan_set_pgd((pgd_t *)pgd_offset_k(start), __pgd(0));
+}
+
+void __init kasan_init(void)
+{
+	u64 i;
+	phys_addr_t pa_start, pa_end;
+
+	/*
+	 * The PGD entries were populated as invalid_pmd_table or
+	 * invalid_pud_table in pagetable_init(), depending on how many
+	 * page table levels are in use, so we have to clear the pgd
+	 * entries of the kasan shadow region first: their values are
+	 * non-zero, pgd_none() would therefore return false, and the
+	 * populate calls below would never create any new pgd entries at all.
+ */ + memcpy(kasan_pg_dir, swapper_pg_dir, sizeof(kasan_pg_dir)); + csr_write64(__pa_symbol(kasan_pg_dir), LOONGARCH_CSR_PGDH); + local_flush_tlb_all(); + + clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); + + /* Maps everything to a single page of zeroes */ + kasan_pgd_populate(KASAN_SHADOW_START, KASAN_SHADOW_END, NUMA_NO_NODE, true); + + kasan_populate_early_shadow(kasan_mem_to_shadow((void *)VMALLOC_START), + kasan_mem_to_shadow((void *)KFENCE_AREA_END)); + + kasan_early_stage = false; + + /* Populate the linear mapping */ + for_each_mem_range(i, &pa_start, &pa_end) { + void *start = (void *)phys_to_virt(pa_start); + void *end = (void *)phys_to_virt(pa_end); + + if (start >= end) + break; + + kasan_map_populate((unsigned long)kasan_mem_to_shadow(start), + (unsigned long)kasan_mem_to_shadow(end), NUMA_NO_NODE); + } + + /* Populate modules mapping */ + kasan_map_populate((unsigned long)kasan_mem_to_shadow((void *)MODULES_VADDR), + (unsigned long)kasan_mem_to_shadow((void *)MODULES_END), NUMA_NO_NODE); + /* + * KAsan may reuse the contents of kasan_early_shadow_pte directly, so we + * should make sure that it maps the zero page read-only. + */ + for (i = 0; i < PTRS_PER_PTE; i++) + set_pte(&kasan_early_shadow_pte[i], + pfn_pte(__phys_to_pfn(__pa_symbol(kasan_early_shadow_page)), PAGE_KERNEL_RO)); + + memset(kasan_early_shadow_page, 0, PAGE_SIZE); + csr_write64(__pa_symbol(swapper_pg_dir), LOONGARCH_CSR_PGDH); + local_flush_tlb_all(); + + /* At this point kasan is fully initialized. Enable error messages */ + init_task.kasan_depth = 0; + pr_info("KernelAddressSanitizer initialized.\n"); +} diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile index d8b75f07c869..5c97d1463328 100644 --- a/arch/loongarch/vdso/Makefile +++ b/arch/loongarch/vdso/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 # Objects to go into the VDSO. +KASAN_SANITIZE := n KCOV_INSTRUMENT := n # Include the generic Makefile to check the built vdso. From 671eae93ae2090d2df01d810d354cab05f6bed8b Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Thu, 7 Sep 2023 12:06:20 +0800 Subject: [PATCH 25/25] LoongArch: Update Loongson-3 default config file 1, Enable LSX and LASX. 2, Enable KASLR (CONFIG_RANDOMIZE_BASE). 3, Enable jump label (patching mechanism for static key). 4, Enable LoongArch CRC32(c) Acceleration. 5, Enable Loongson-specific drivers: I2C/RTC/DRM/SOC/CLK/PINCTRL/GPIO/SPI. 6, Enable EXFAT/NTFS3/JFS/GFS2/OCFS2/UBIFS/EROFS/CEPH file systems. 7, Enable WangXun NGBE/TXGBE NIC drivers. 8, Enable some IPVS options. 9, Remove CONFIG_SYSFS_DEPRECATED since it is removed in Kconfig. 10, Remove CONFIG_IP_NF_TARGET_CLUSTERIP since it is removed in Kconfig. 11, Remove CONFIG_NFT_OBJREF since it is removed in Kconfig. 12, Remove CONFIG_R8188EU since it is replaced by CONFIG_RTL8XXXU. 
Signed-off-by: Trevor Woerner Signed-off-by: Xuewen Wang Signed-off-by: Huacai Chen --- arch/loongarch/configs/loongson3_defconfig | 74 ++++++++++++++++++++-- 1 file changed, 70 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index d64849b4cba1..a3b52aaa83b3 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -30,7 +30,6 @@ CONFIG_NAMESPACES=y CONFIG_USER_NS=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y -CONFIG_SYSFS_DEPRECATED=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y @@ -47,8 +46,12 @@ CONFIG_SMP=y CONFIG_HOTPLUG_CPU=y CONFIG_NR_CPUS=64 CONFIG_NUMA=y +CONFIG_CPU_HAS_FPU=y +CONFIG_CPU_HAS_LSX=y +CONFIG_CPU_HAS_LASX=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y +CONFIG_RANDOMIZE_BASE=y CONFIG_SUSPEND=y CONFIG_HIBERNATION=y CONFIG_ACPI=y @@ -63,6 +66,7 @@ CONFIG_EFI_ZBOOT=y CONFIG_EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER=y CONFIG_EFI_CAPSULE_LOADER=m CONFIG_EFI_TEST=m +CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_FORCE_LOAD=y CONFIG_MODULE_UNLOAD=y @@ -108,7 +112,12 @@ CONFIG_IP_PNP_BOOTP=y CONFIG_IP_PNP_RARP=y CONFIG_NET_IPIP=m CONFIG_NET_IPGRE_DEMUX=m +CONFIG_NET_IPGRE=m +CONFIG_NET_IPGRE_BROADCAST=y CONFIG_IP_MROUTE=y +CONFIG_IP_MROUTE_MULTIPLE_TABLES=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y CONFIG_INET_ESP=m CONFIG_INET_UDP_DIAG=y CONFIG_TCP_CONG_ADVANCED=y @@ -137,7 +146,6 @@ CONFIG_NFT_MASQ=m CONFIG_NFT_REDIR=m CONFIG_NFT_NAT=m CONFIG_NFT_TUNNEL=m -CONFIG_NFT_OBJREF=m CONFIG_NFT_QUEUE=m CONFIG_NFT_QUOTA=m CONFIG_NFT_REJECT=m @@ -208,7 +216,11 @@ CONFIG_IP_VS=m CONFIG_IP_VS_IPV6=y CONFIG_IP_VS_PROTO_TCP=y CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_PROTO_ESP=y +CONFIG_IP_VS_PROTO_AH=y +CONFIG_IP_VS_PROTO_SCTP=y CONFIG_IP_VS_RR=m +CONFIG_IP_VS_WRR=m CONFIG_IP_VS_NFCT=y CONFIG_NF_TABLES_IPV4=y CONFIG_NFT_DUP_IPV4=m @@ -227,7 +239,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_TARGET_NETMAP=m CONFIG_IP_NF_TARGET_REDIRECT=m CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m CONFIG_IP_NF_TARGET_ECN=m CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m @@ -363,6 +374,8 @@ CONFIG_MTD_CFI_AMDSTD=m CONFIG_MTD_CFI_STAA=m CONFIG_MTD_RAM=m CONFIG_MTD_ROM=m +CONFIG_MTD_UBI=m +CONFIG_MTD_UBI_BLOCK=y CONFIG_PARPORT=y CONFIG_PARPORT_PC=y CONFIG_PARPORT_SERIAL=y @@ -370,6 +383,7 @@ CONFIG_PARPORT_PC_FIFO=y CONFIG_ZRAM=m CONFIG_ZRAM_DEF_COMP_ZSTD=y CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_DRBD=m CONFIG_BLK_DEV_NBD=m CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 @@ -516,6 +530,8 @@ CONFIG_STMMAC_ETH=y # CONFIG_NET_VENDOR_TEHUTI is not set # CONFIG_NET_VENDOR_TI is not set # CONFIG_NET_VENDOR_VIA is not set +CONFIG_NGBE=y +CONFIG_TXGBE=y # CONFIG_NET_VENDOR_WIZNET is not set # CONFIG_NET_VENDOR_XILINX is not set CONFIG_PPP=m @@ -602,9 +618,15 @@ CONFIG_HW_RANDOM_VIRTIO=m CONFIG_I2C_CHARDEV=y CONFIG_I2C_PIIX4=y CONFIG_I2C_GPIO=y +CONFIG_I2C_LS2X=y CONFIG_SPI=y +CONFIG_SPI_LOONGSON_PCI=m +CONFIG_SPI_LOONGSON_PLATFORM=m +CONFIG_PINCTRL=y +CONFIG_PINCTRL_LOONGSON2=y CONFIG_GPIO_SYSFS=y CONFIG_GPIO_LOONGSON=y +CONFIG_GPIO_LOONGSON_64BIT=y CONFIG_POWER_RESET=y CONFIG_POWER_RESET_RESTART=y CONFIG_POWER_RESET_SYSCON=y @@ -614,6 +636,7 @@ CONFIG_SENSORS_LM75=m CONFIG_SENSORS_LM93=m CONFIG_SENSORS_W83795=m CONFIG_SENSORS_W83627HF=m +CONFIG_LOONGSON2_THERMAL=m CONFIG_RC_CORE=m CONFIG_LIRC=y CONFIG_RC_DECODERS=y @@ -643,6 +666,7 @@ CONFIG_DRM_AMDGPU_USERPTR=y CONFIG_DRM_AST=y CONFIG_DRM_QXL=m CONFIG_DRM_VIRTIO_GPU=m +CONFIG_DRM_LOONGSON=y CONFIG_FB=y 
CONFIG_FB_EFI=y CONFIG_FB_RADEON=y @@ -712,6 +736,7 @@ CONFIG_UCSI_ACPI=m CONFIG_INFINIBAND=m CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_EFI=y +CONFIG_RTC_DRV_LOONGSON=y CONFIG_DMADEVICES=y CONFIG_UIO=m CONFIG_UIO_PDRV_GENIRQ=m @@ -745,7 +770,9 @@ CONFIG_COMEDI_NI_LABPC_PCI=m CONFIG_COMEDI_NI_PCIDIO=m CONFIG_COMEDI_NI_PCIMIO=m CONFIG_STAGING=y -CONFIG_R8188EU=m +CONFIG_COMMON_CLK_LOONGSON2=y +CONFIG_LOONGSON2_GUTS=y +CONFIG_LOONGSON2_PM=y CONFIG_PM_DEVFREQ=y CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=y CONFIG_DEVFREQ_GOV_PERFORMANCE=y @@ -759,10 +786,17 @@ CONFIG_EXT2_FS_SECURITY=y CONFIG_EXT3_FS=y CONFIG_EXT3_FS_POSIX_ACL=y CONFIG_EXT3_FS_SECURITY=y +CONFIG_JFS_FS=m +CONFIG_JFS_POSIX_ACL=y +CONFIG_JFS_SECURITY=y CONFIG_XFS_FS=y CONFIG_XFS_QUOTA=y CONFIG_XFS_POSIX_ACL=y +CONFIG_GFS2_FS=m +CONFIG_GFS2_FS_LOCKING_DLM=y +CONFIG_OCFS2_FS=m CONFIG_BTRFS_FS=y +CONFIG_BTRFS_FS_POSIX_ACL=y CONFIG_FANOTIFY=y CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y CONFIG_QUOTA=y @@ -771,11 +805,14 @@ CONFIG_QFMT_V1=m CONFIG_QFMT_V2=m CONFIG_AUTOFS_FS=y CONFIG_FUSE_FS=m +CONFIG_CUSE=m +CONFIG_VIRTIO_FS=m CONFIG_OVERLAY_FS=y CONFIG_OVERLAY_FS_INDEX=y CONFIG_OVERLAY_FS_XINO_AUTO=y CONFIG_OVERLAY_FS_METACOPY=y CONFIG_FSCACHE=y +CONFIG_CACHEFILES=m CONFIG_ISO9660_FS=y CONFIG_JOLIET=y CONFIG_ZISOFS=y @@ -784,19 +821,42 @@ CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=936 CONFIG_FAT_DEFAULT_IOCHARSET="gb2312" +CONFIG_EXFAT_FS=m +CONFIG_NTFS3_FS=m +CONFIG_NTFS3_64BIT_CLUSTER=y +CONFIG_NTFS3_LZX_XPRESS=y CONFIG_PROC_KCORE=y CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_HUGETLBFS=y CONFIG_CONFIGFS_FS=y +CONFIG_ORANGEFS_FS=m +CONFIG_ECRYPT_FS=m +CONFIG_ECRYPT_FS_MESSAGING=y CONFIG_HFS_FS=m CONFIG_HFSPLUS_FS=m +CONFIG_UBIFS_FS=m +CONFIG_UBIFS_FS_ADVANCED_COMPR=y CONFIG_CRAMFS=m CONFIG_SQUASHFS=y CONFIG_SQUASHFS_XATTR=y CONFIG_SQUASHFS_LZ4=y CONFIG_SQUASHFS_LZO=y CONFIG_SQUASHFS_XZ=y +CONFIG_MINIX_FS=m +CONFIG_ROMFS_FS=m +CONFIG_PSTORE=m +CONFIG_PSTORE_LZO_COMPRESS=m +CONFIG_PSTORE_LZ4_COMPRESS=m +CONFIG_PSTORE_LZ4HC_COMPRESS=m +CONFIG_PSTORE_842_COMPRESS=y +CONFIG_PSTORE_ZSTD_COMPRESS=y +CONFIG_PSTORE_ZSTD_COMPRESS_DEFAULT=y +CONFIG_SYSV_FS=m +CONFIG_UFS_FS=m +CONFIG_EROFS_FS=m +CONFIG_EROFS_FS_ZIP_LZMA=y +CONFIG_EROFS_FS_PCPU_KTHREAD=y CONFIG_NFS_FS=y CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=y @@ -807,6 +867,10 @@ CONFIG_NFSD=y CONFIG_NFSD_V3_ACL=y CONFIG_NFSD_V4=y CONFIG_NFSD_BLOCKLAYOUT=y +CONFIG_CEPH_FS=m +CONFIG_CEPH_FSCACHE=y +CONFIG_CEPH_FS_POSIX_ACL=y +CONFIG_CEPH_FS_SECURITY_LABEL=y CONFIG_CIFS=m # CONFIG_CIFS_DEBUG is not set CONFIG_9P_FS=y @@ -814,6 +878,7 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_CODEPAGE_936=y CONFIG_NLS_ASCII=y CONFIG_NLS_UTF8=y +CONFIG_DLM=m CONFIG_KEY_DH_OPERATIONS=y CONFIG_SECURITY=y CONFIG_SECURITY_SELINUX=y @@ -847,6 +912,7 @@ CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m +CONFIG_CRYPTO_CRC32_LOONGARCH=m CONFIG_CRYPTO_DEV_VIRTIO=m CONFIG_PRINTK_TIME=y CONFIG_STRIP_ASM_SYMS=y
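As a closing illustration for the KASAN patch earlier in this series: the
translation done by kasan_mem_to_shadow() is simply "keep the valid address
bits, divide by 8, add a per-segment offset". The sketch below has the same
shape but uses placeholder constants (the segment value and shadow base are
assumptions for illustration, not the kernel's real CACHE_BASE or
XKPRANGE_CC_SHADOW_OFFSET), and it handles only the cached segment:

    #include <stdio.h>

    #define SCALE_SHIFT 3                       /* one shadow byte per 8 bytes */
    #define ADDR_MASK   ((1UL << 48) - 1)       /* keep the valid address bits */
    #define CC_SEG      0x9000UL                /* cached xkprange segment */
    #define CC_SHADOW   0xffff800000000000UL    /* placeholder shadow base */

    static unsigned long mem_to_shadow(unsigned long addr)
    {
            unsigned long seg = (addr >> 48) & 0xffff;

            if (seg != CC_SEG)  /* the real helper switches over all segments */
                    return 0;
            return ((addr & ADDR_MASK) >> SCALE_SHIFT) + CC_SHADOW;
    }

    int main(void)
    {
            /* a 16-byte object maps to 2 consecutive shadow bytes */
            unsigned long a = 0x9000000012345680UL;

            printf("%#lx -> %#lx\n", a, mem_to_shadow(a));
            printf("%#lx -> %#lx\n", a + 8UL, mem_to_shadow(a + 8UL));
            return 0;
    }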