From a2b05b7aa60e1e9b60faf01dfb1cca35638d1ab1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 11 May 2017 21:24:41 +1000 Subject: [PATCH 01/25] powerpc/64s: Add dt_cpu_ftrs boot time setup option Provide a dt_cpu_ftrs= cmdline option to disable the dt_cpu_ftrs CPU feature discovery, and fall back to the "cputable" based version. Also allow control of advertising unknown features to userspace and with this parameter, and remove the clunky CONFIG option. Signed-off-by: Nicholas Piggin [mpe: Add explicit early check of bootargs in dt_cpu_ftrs_init()] Signed-off-by: Michael Ellerman --- .../admin-guide/kernel-parameters.txt | 9 +++ arch/powerpc/Kconfig | 5 -- arch/powerpc/kernel/dt_cpu_ftrs.c | 57 ++++++++++++++++--- 3 files changed, 57 insertions(+), 14 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 15f79c27748d..0f5c3b4347c6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -866,6 +866,15 @@ dscc4.setup= [NET] + dt_cpu_ftrs= [PPC] + Format: {"off" | "known"} + Control how the dt_cpu_ftrs device-tree binding is + used for CPU feature discovery and setup (if it + exists). + off: Do not use it, fall back to legacy cpu table. + known: Do not pass through unknown features to guests + or userspace, only those that the kernel is aware of. + dump_apple_properties [X86] Dump name and content of EFI device properties on x86 Macs. Useful for driver authors to determine diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index f7c8f9972f61..4a4a05afcaf7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -391,11 +391,6 @@ config PPC_DT_CPU_FTRS firmware provides this binding. If you're not sure say Y. -config PPC_CPUFEATURES_ENABLE_UNKNOWN - bool "cpufeatures pass through unknown features to guest/userspace" - depends on PPC_DT_CPU_FTRS - default y - config HIGHMEM bool "High memory support" depends on PPC32 diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index fcc7588a96d6..76eebba13a1d 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -671,12 +672,24 @@ static struct dt_cpu_feature_match __initdata {"wait-v3", feat_enable, 0}, }; -/* XXX: how to configure this? Default + boot time? */ -#ifdef CONFIG_PPC_CPUFEATURES_ENABLE_UNKNOWN -#define CPU_FEATURE_ENABLE_UNKNOWN 1 -#else -#define CPU_FEATURE_ENABLE_UNKNOWN 0 -#endif +static bool __initdata using_dt_cpu_ftrs; +static bool __initdata enable_unknown = true; + +static int __init dt_cpu_ftrs_parse(char *str) +{ + if (!str) + return 0; + + if (!strcmp(str, "off")) + using_dt_cpu_ftrs = false; + else if (!strcmp(str, "known")) + enable_unknown = false; + else + return 1; + + return 0; +} +early_param("dt_cpu_ftrs", dt_cpu_ftrs_parse); static void __init cpufeatures_setup_start(u32 isa) { @@ -707,7 +720,7 @@ static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f) } } - if (!known && CPU_FEATURE_ENABLE_UNKNOWN) { + if (!known && enable_unknown) { if (!feat_try_enable_unknown(f)) { pr_info("not enabling: %s (unknown and unsupported by kernel)\n", f->name); @@ -756,6 +769,26 @@ static void __init cpufeatures_setup_finished(void) cur_cpu_spec->cpu_features, cur_cpu_spec->mmu_features); } +static int __init disabled_on_cmdline(void) +{ + unsigned long root, chosen; + const char *p; + + root = of_get_flat_dt_root(); + chosen = of_get_flat_dt_subnode_by_name(root, "chosen"); + if (chosen == -FDT_ERR_NOTFOUND) + return false; + + p = of_get_flat_dt_prop(chosen, "bootargs", NULL); + if (!p) + return false; + + if (strstr(p, "dt_cpu_ftrs=off")) + return true; + + return false; +} + static int __init fdt_find_cpu_features(unsigned long node, const char *uname, int depth, void *data) { @@ -766,8 +799,6 @@ static int __init fdt_find_cpu_features(unsigned long node, const char *uname, return 0; } -static bool __initdata using_dt_cpu_ftrs = false; - bool __init dt_cpu_ftrs_in_use(void) { return using_dt_cpu_ftrs; @@ -775,6 +806,8 @@ bool __init dt_cpu_ftrs_in_use(void) bool __init dt_cpu_ftrs_init(void *fdt) { + using_dt_cpu_ftrs = false; + /* Setup and verify the FDT, if it fails we just bail */ if (!early_init_dt_verify(fdt)) return false; @@ -782,6 +815,9 @@ bool __init dt_cpu_ftrs_init(void *fdt) if (!of_scan_flat_dt(fdt_find_cpu_features, NULL)) return false; + if (disabled_on_cmdline()) + return false; + cpufeatures_setup_cpu(); using_dt_cpu_ftrs = true; @@ -1027,5 +1063,8 @@ static int __init dt_cpu_ftrs_scan_callback(unsigned long node, const char void __init dt_cpu_ftrs_scan(void) { + if (!using_dt_cpu_ftrs) + return; + of_scan_flat_dt(dt_cpu_ftrs_scan_callback, NULL); } From 99acc9bede06bbb2662aafff51f5b9e529fa845e Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 29 May 2017 20:26:07 +1000 Subject: [PATCH 02/25] powerpc/spufs: Fix coredump of SPU contexts If a process dumps core while it has SPU contexts active then we have code to also dump information about the SPU contexts. Unfortunately it's been broken for 3 1/2 years, and we didn't notice. In commit 7b1f4020d0d1 ("spufs: get rid of dump_emit() wrappers") the nread variable was removed and rc used instead. That means when the loop exits successfully, rc has the number of bytes read, but it's then used as the return value for the function, which should return 0 on success. So fix it by setting rc = 0 before returning in the success case. Fixes: 7b1f4020d0d1 ("spufs: get rid of dump_emit() wrappers") Signed-off-by: Michael Ellerman Acked-by: Jeremy Kerr Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/cell/spufs/coredump.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c index e5a891ae80ee..84b7ac926ce6 100644 --- a/arch/powerpc/platforms/cell/spufs/coredump.c +++ b/arch/powerpc/platforms/cell/spufs/coredump.c @@ -175,6 +175,8 @@ static int spufs_arch_write_note(struct spu_context *ctx, int i, skip = roundup(cprm->pos - total + sz, 4) - cprm->pos; if (!dump_skip(cprm, skip)) goto Eio; + + rc = 0; out: free_page((unsigned long)buf); return rc; From 6f553912eedafae13ff20b322a65e471fe7f5236 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 24 May 2017 10:01:55 +0200 Subject: [PATCH 03/25] powerpc/sysdev/simple_gpio: Fix oops in gpio save_regs function of_mm_gpiochip_add_data() generates an oops for NULL pointer dereference. of_mm_gpiochip_add_data() calls mm_gc->save_regs() before setting the data, therefore ->save_regs() cannot use gpiochip_get_data() Fixes: 937daafca774 ("powerpc: simple-gpio: use gpiochip data pointer") Cc: stable@vger.kernel.org # v4.7+ Signed-off-by: Christophe Leroy Reviewed-by: Linus Walleij Signed-off-by: Michael Ellerman --- arch/powerpc/sysdev/simple_gpio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/simple_gpio.c b/arch/powerpc/sysdev/simple_gpio.c index ef470b470b04..6afddae2fb47 100644 --- a/arch/powerpc/sysdev/simple_gpio.c +++ b/arch/powerpc/sysdev/simple_gpio.c @@ -75,7 +75,8 @@ static int u8_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) static void u8_gpio_save_regs(struct of_mm_gpio_chip *mm_gc) { - struct u8_gpio_chip *u8_gc = gpiochip_get_data(&mm_gc->gc); + struct u8_gpio_chip *u8_gc = + container_of(mm_gc, struct u8_gpio_chip, mm_gc); u8_gc->data = in_8(mm_gc->regs); } From dc421b200f91930c9c6a9586810ff8c232cf10fc Mon Sep 17 00:00:00 2001 From: Michael Bringmann Date: Mon, 22 May 2017 15:44:37 -0500 Subject: [PATCH 04/25] powerpc/hotplug-mem: Fix missing endian conversion of aa_index When adding or removing memory, the aa_index (affinity value) for the memblock must also be converted to match the endianness of the rest of the 'ibm,dynamic-memory' property. Otherwise, subsequent retrieval of the attribute will likely lead to non-existent nodes, followed by using the default node in the code inappropriately. Fixes: 5f97b2a0d176 ("powerpc/pseries: Implement memory hotplug add in the kernel") Cc: stable@vger.kernel.org # v4.1+ Signed-off-by: Michael Bringmann Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/hotplug-memory.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index e104c71ea44a..1fb162ba9d1c 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -124,6 +124,7 @@ static struct property *dlpar_clone_drconf_property(struct device_node *dn) for (i = 0; i < num_lmbs; i++) { lmbs[i].base_addr = be64_to_cpu(lmbs[i].base_addr); lmbs[i].drc_index = be32_to_cpu(lmbs[i].drc_index); + lmbs[i].aa_index = be32_to_cpu(lmbs[i].aa_index); lmbs[i].flags = be32_to_cpu(lmbs[i].flags); } @@ -147,6 +148,7 @@ static void dlpar_update_drconf_property(struct device_node *dn, for (i = 0; i < num_lmbs; i++) { lmbs[i].base_addr = cpu_to_be64(lmbs[i].base_addr); lmbs[i].drc_index = cpu_to_be32(lmbs[i].drc_index); + lmbs[i].aa_index = cpu_to_be32(lmbs[i].aa_index); lmbs[i].flags = cpu_to_be32(lmbs[i].flags); } From 0e5e7f5e9700661c3ddd95501743fb52fec1ab07 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 25 May 2017 16:33:52 +1000 Subject: [PATCH 05/25] powerpc/64: Reclaim CPU_FTR_SUBCORE We are running low on CPU feature bits, so we only want to use them when it's really necessary. CPU_FTR_SUBCORE is only used in one place, and only in C, so we don't need it in order to make asm patching work. It can only be set on "Power8" CPUs, which in practice means POWER8, POWER8E and POWER8NVL. There are no plans to implement it on future CPUs, but if there ever were we could retrofit it then. Although KVM uses subcores, it never looks at the CPU feature, it either looks at the ISA level or the threads_per_subcore value. So drop the CPU feature and do a PVR check instead. Drop the device tree "subcore" feature as we no longer support doing anything with it, and we will drop it from skiboot too. Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/cputable.h | 3 +-- arch/powerpc/kernel/dt_cpu_ftrs.c | 1 - arch/powerpc/platforms/powernv/subcore.c | 8 +++++++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index c2d509584a98..d02ad93bf708 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -214,7 +214,6 @@ enum { #define CPU_FTR_DAWR LONG_ASM_CONST(0x0400000000000000) #define CPU_FTR_DABRX LONG_ASM_CONST(0x0800000000000000) #define CPU_FTR_PMAO_BUG LONG_ASM_CONST(0x1000000000000000) -#define CPU_FTR_SUBCORE LONG_ASM_CONST(0x2000000000000000) #define CPU_FTR_POWER9_DD1 LONG_ASM_CONST(0x4000000000000000) #ifndef __ASSEMBLY__ @@ -463,7 +462,7 @@ enum { CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \ CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \ CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \ - CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_SUBCORE) + CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP) #define CPU_FTRS_POWER8E (CPU_FTRS_POWER8 | CPU_FTR_PMAO_BUG) #define CPU_FTRS_POWER8_DD1 (CPU_FTRS_POWER8 & ~CPU_FTR_DBELL) #define CPU_FTRS_POWER9 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 76eebba13a1d..4c7656dc4e04 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -643,7 +643,6 @@ static struct dt_cpu_feature_match __initdata {"processor-control-facility", feat_enable_dbell, CPU_FTR_DBELL}, {"processor-control-facility-v3", feat_enable_dbell, CPU_FTR_DBELL}, {"processor-utilization-of-resources-register", feat_enable_purr, 0}, - {"subcore", feat_enable, CPU_FTR_SUBCORE}, {"no-execute", feat_enable, 0}, {"strong-access-ordering", feat_enable, CPU_FTR_SAO}, {"cache-inhibited-large-page", feat_enable_large_ci, 0}, diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c index 0babef11136f..8c6119280c13 100644 --- a/arch/powerpc/platforms/powernv/subcore.c +++ b/arch/powerpc/platforms/powernv/subcore.c @@ -407,7 +407,13 @@ static DEVICE_ATTR(subcores_per_core, 0644, static int subcore_init(void) { - if (!cpu_has_feature(CPU_FTR_SUBCORE)) + unsigned pvr_ver; + + pvr_ver = PVR_VER(mfspr(SPRN_PVR)); + + if (pvr_ver != PVR_POWER8 && + pvr_ver != PVR_POWER8E && + pvr_ver != PVR_POWER8NVL) return 0; /* From 1195892c091a15cc862f4e202482a36adc924e12 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 2 Jun 2017 18:43:30 -0300 Subject: [PATCH 06/25] powerpc/kernel: Fix FP and vector register restoration Currently tsk->thread->load_vec and load_fp are not initialized during task creation, which can lead to garbage values in these variables (non-zero values). These variables will be checked later in restore_math() to validate if the FP and vector registers are being utilized. Since these values might be non-zero, the restore_math() will continue to save the FP and vectors even if they were never utilized by the userspace application. load_fp and load_vec counters will then overflow (they wrap at 255) and the FP and Altivec will be finally disabled, but before that condition is reached (counter overflow) several context switches will have restored FP and vector registers without need, causing a performance degradation. Fixes: 70fe3d980f5f ("powerpc: Restore FPU/VEC/VSX if previously used") Cc: stable@vger.kernel.org # v4.6+ Signed-off-by: Breno Leitao Signed-off-by: Gustavo Romero Acked-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index baae104b16c7..a9435397eab8 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1666,6 +1666,7 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) #ifdef CONFIG_VSX current->thread.used_vsr = 0; #endif + current->thread.load_fp = 0; memset(¤t->thread.fp_state, 0, sizeof(current->thread.fp_state)); current->thread.fp_save_area = NULL; #ifdef CONFIG_ALTIVEC @@ -1674,6 +1675,7 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) current->thread.vr_save_area = NULL; current->thread.vrsave = 0; current->thread.used_vr = 0; + current->thread.load_vec = 0; #endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_SPE memset(current->thread.evr, 0, sizeof(current->thread.evr)); From 7f22ced4377628074e2ac25f41a88f98eb3b03f1 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 5 Jun 2017 11:40:59 -0300 Subject: [PATCH 07/25] powerpc/kernel: Initialize load_tm on task creation Currently tsk->thread.load_tm is not initialized in the task creation and can contain garbage on a new task. This is an undesired behaviour, since it affects the timing to enable and disable the transactional memory laziness (disabling and enabling the MSR TM bit, which affects TM reclaim and recheckpoint in the scheduling process). Fixes: 5d176f751ee3 ("powerpc: tm: Enable transactional memory (TM) lazily for userspace") Cc: stable@vger.kernel.org # v4.9+ Signed-off-by: Breno Leitao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index a9435397eab8..2ad725ef4368 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1687,6 +1687,7 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) current->thread.tm_tfhar = 0; current->thread.tm_texasr = 0; current->thread.tm_tfiar = 0; + current->thread.load_tm = 0; #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ } EXPORT_SYMBOL(start_thread); From b3aa20ba2ba8072b73bd799605b8c98927b7056c Mon Sep 17 00:00:00 2001 From: Vaibhav Jain Date: Fri, 2 Jun 2017 22:26:48 +0530 Subject: [PATCH 08/25] cxl: Avoid double free_irq() for psl,slice interrupts During an eeh call to cxl_remove can result in double free_irq of psl,slice interrupts. This can happen if perst_reloads_same_image == 1 and call to cxl_configure_adapter() fails during slot_reset callback. In such a case we see a kernel oops with following back-trace: Oops: Kernel access of bad area, sig: 11 [#1] Call Trace: free_irq+0x88/0xd0 (unreliable) cxl_unmap_irq+0x20/0x40 [cxl] cxl_native_release_psl_irq+0x78/0xd8 [cxl] pci_deconfigure_afu+0xac/0x110 [cxl] cxl_remove+0x104/0x210 [cxl] pci_device_remove+0x6c/0x110 device_release_driver_internal+0x204/0x2e0 pci_stop_bus_device+0xa0/0xd0 pci_stop_and_remove_bus_device+0x28/0x40 pci_hp_remove_devices+0xb0/0x150 pci_hp_remove_devices+0x68/0x150 eeh_handle_normal_event+0x140/0x580 eeh_handle_event+0x174/0x360 eeh_event_handler+0x1e8/0x1f0 This patch fixes the issue of double free_irq by checking that variables that hold the virqs (err_hwirq, serr_hwirq, psl_virq) are not '0' before un-mapping and resetting these variables to '0' when they are un-mapped. Cc: stable@vger.kernel.org Signed-off-by: Vaibhav Jain Reviewed-by: Andrew Donnellan Acked-by: Frederic Barrat Signed-off-by: Michael Ellerman --- drivers/misc/cxl/native.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c index 871a2f09c718..8d6ea9712dbd 100644 --- a/drivers/misc/cxl/native.c +++ b/drivers/misc/cxl/native.c @@ -1302,13 +1302,16 @@ int cxl_native_register_psl_err_irq(struct cxl *adapter) void cxl_native_release_psl_err_irq(struct cxl *adapter) { - if (adapter->native->err_virq != irq_find_mapping(NULL, adapter->native->err_hwirq)) + if (adapter->native->err_virq == 0 || + adapter->native->err_virq != + irq_find_mapping(NULL, adapter->native->err_hwirq)) return; cxl_p1_write(adapter, CXL_PSL_ErrIVTE, 0x0000000000000000); cxl_unmap_irq(adapter->native->err_virq, adapter); cxl_ops->release_one_irq(adapter, adapter->native->err_hwirq); kfree(adapter->irq_name); + adapter->native->err_virq = 0; } int cxl_native_register_serr_irq(struct cxl_afu *afu) @@ -1346,13 +1349,15 @@ int cxl_native_register_serr_irq(struct cxl_afu *afu) void cxl_native_release_serr_irq(struct cxl_afu *afu) { - if (afu->serr_virq != irq_find_mapping(NULL, afu->serr_hwirq)) + if (afu->serr_virq == 0 || + afu->serr_virq != irq_find_mapping(NULL, afu->serr_hwirq)) return; cxl_p1n_write(afu, CXL_PSL_SERR_An, 0x0000000000000000); cxl_unmap_irq(afu->serr_virq, afu); cxl_ops->release_one_irq(afu->adapter, afu->serr_hwirq); kfree(afu->err_irq_name); + afu->serr_virq = 0; } int cxl_native_register_psl_irq(struct cxl_afu *afu) @@ -1375,12 +1380,15 @@ int cxl_native_register_psl_irq(struct cxl_afu *afu) void cxl_native_release_psl_irq(struct cxl_afu *afu) { - if (afu->native->psl_virq != irq_find_mapping(NULL, afu->native->psl_hwirq)) + if (afu->native->psl_virq == 0 || + afu->native->psl_virq != + irq_find_mapping(NULL, afu->native->psl_hwirq)) return; cxl_unmap_irq(afu->native->psl_virq, afu); cxl_ops->release_one_irq(afu->adapter, afu->native->psl_hwirq); kfree(afu->psl_irq_name); + afu->native->psl_virq = 0; } static void recover_psl_err(struct cxl_afu *afu, u64 errstat) From ba4a648f12f4cd0a8003dd229b6ca8a53348ee4b Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 6 Jun 2017 20:23:57 +1000 Subject: [PATCH 09/25] powerpc/numa: Fix percpu allocations to be NUMA aware In commit 8c272261194d ("powerpc/numa: Enable USE_PERCPU_NUMA_NODE_ID"), we switched to the generic implementation of cpu_to_node(), which uses a percpu variable to hold the NUMA node for each CPU. Unfortunately we neglected to notice that we use cpu_to_node() in the allocation of our percpu areas, leading to a chicken and egg problem. In practice what happens is when we are setting up the percpu areas, cpu_to_node() reports that all CPUs are on node 0, so we allocate all percpu areas on node 0. This is visible in the dmesg output, as all pcpu allocs being in group 0: pcpu-alloc: [0] 00 01 02 03 [0] 04 05 06 07 pcpu-alloc: [0] 08 09 10 11 [0] 12 13 14 15 pcpu-alloc: [0] 16 17 18 19 [0] 20 21 22 23 pcpu-alloc: [0] 24 25 26 27 [0] 28 29 30 31 pcpu-alloc: [0] 32 33 34 35 [0] 36 37 38 39 pcpu-alloc: [0] 40 41 42 43 [0] 44 45 46 47 To fix it we need an early_cpu_to_node() which can run prior to percpu being setup. We already have the numa_cpu_lookup_table we can use, so just plumb it in. With the patch dmesg output shows two groups, 0 and 1: pcpu-alloc: [0] 00 01 02 03 [0] 04 05 06 07 pcpu-alloc: [0] 08 09 10 11 [0] 12 13 14 15 pcpu-alloc: [0] 16 17 18 19 [0] 20 21 22 23 pcpu-alloc: [1] 24 25 26 27 [1] 28 29 30 31 pcpu-alloc: [1] 32 33 34 35 [1] 36 37 38 39 pcpu-alloc: [1] 40 41 42 43 [1] 44 45 46 47 We can also check the data_offset in the paca of various CPUs, with the fix we see: CPU 0: data_offset = 0x0ffe8b0000 CPU 24: data_offset = 0x1ffe5b0000 And we can see from dmesg that CPU 24 has an allocation on node 1: node 0: [mem 0x0000000000000000-0x0000000fffffffff] node 1: [mem 0x0000001000000000-0x0000001fffffffff] Cc: stable@vger.kernel.org # v3.16+ Fixes: 8c272261194d ("powerpc/numa: Enable USE_PERCPU_NUMA_NODE_ID") Signed-off-by: Michael Ellerman Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/topology.h | 14 ++++++++++++++ arch/powerpc/kernel/setup_64.c | 4 ++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 8b3b46b7b0f2..329771559cbb 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -44,8 +44,22 @@ extern void __init dump_numa_cpu_topology(void); extern int sysfs_add_device_to_node(struct device *dev, int nid); extern void sysfs_remove_device_from_node(struct device *dev, int nid); +static inline int early_cpu_to_node(int cpu) +{ + int nid; + + nid = numa_cpu_lookup_table[cpu]; + + /* + * Fall back to node 0 if nid is unset (it should be, except bugs). + * This allows callers to safely do NODE_DATA(early_cpu_to_node(cpu)). + */ + return (nid < 0) ? 0 : nid; +} #else +static inline int early_cpu_to_node(int cpu) { return 0; } + static inline void dump_numa_cpu_topology(void) {} static inline int sysfs_add_device_to_node(struct device *dev, int nid) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index f35ff9dea4fb..a8c1f99e9607 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -661,7 +661,7 @@ void __init emergency_stack_init(void) static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) { - return __alloc_bootmem_node(NODE_DATA(cpu_to_node(cpu)), size, align, + return __alloc_bootmem_node(NODE_DATA(early_cpu_to_node(cpu)), size, align, __pa(MAX_DMA_ADDRESS)); } @@ -672,7 +672,7 @@ static void __init pcpu_fc_free(void *ptr, size_t size) static int pcpu_cpu_distance(unsigned int from, unsigned int to) { - if (cpu_to_node(from) == cpu_to_node(to)) + if (early_cpu_to_node(from) == early_cpu_to_node(to)) return LOCAL_DISTANCE; else return REMOTE_DISTANCE; From 8c218578fcbbbdb10416c8614658bf32e3bf1655 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Fri, 26 May 2017 13:38:27 +0530 Subject: [PATCH 10/25] powerpc/perf: Fix Power9 test_adder fields Commit 8d911904f3ce4 ('powerpc/perf: Add restrictions to PMC5 in power9 DD1') was added to restrict the use of PMC5 in Power9 DD1. Intention was to disable the use of PMC5 using raw event code. But instead of updating the power9_isa207_pmu structure (used on DD1), the commit incorrectly updated the power9_pmu structure. Fix it. Fixes: 8d911904f3ce ("powerpc/perf: Add restrictions to PMC5 in power9 DD1") Reported-by: Shriya Signed-off-by: Madhavan Srinivasan Tested-by: Shriya Signed-off-by: Michael Ellerman --- arch/powerpc/perf/power9-pmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index 018f8e90ac35..bb28e1a41257 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -402,7 +402,7 @@ static struct power_pmu power9_isa207_pmu = { .name = "POWER9", .n_counter = MAX_PMU_COUNTERS, .add_fields = ISA207_ADD_FIELDS, - .test_adder = ISA207_TEST_ADDER, + .test_adder = P9_DD1_TEST_ADDER, .compute_mmcr = isa207_compute_mmcr, .config_bhrb = power9_config_bhrb, .bhrb_filter_map = power9_bhrb_filter_map, @@ -421,7 +421,7 @@ static struct power_pmu power9_pmu = { .name = "POWER9", .n_counter = MAX_PMU_COUNTERS, .add_fields = ISA207_ADD_FIELDS, - .test_adder = P9_DD1_TEST_ADDER, + .test_adder = ISA207_TEST_ADDER, .compute_mmcr = isa207_compute_mmcr, .config_bhrb = power9_config_bhrb, .bhrb_filter_map = power9_bhrb_filter_map, From cec422c11caeeccae709e9942058b6b644ce434c Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Tue, 6 Jun 2017 11:43:41 +0200 Subject: [PATCH 11/25] cxl: Fix error path on bad ioctl Fix error path if we can't copy user structure on CXL_IOCTL_START_WORK ioctl. We shouldn't unlock the context status mutex as it was not locked (yet). Fixes: 0712dc7e73e5 ("cxl: Fix issues when unmapping contexts") Cc: stable@vger.kernel.org # v3.19+ Signed-off-by: Frederic Barrat Reviewed-by: Vaibhav Jain Reviewed-by: Andrew Donnellan Signed-off-by: Michael Ellerman --- drivers/misc/cxl/file.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c index 17b433f1ce23..0761271d68c5 100644 --- a/drivers/misc/cxl/file.c +++ b/drivers/misc/cxl/file.c @@ -159,11 +159,8 @@ static long afu_ioctl_start_work(struct cxl_context *ctx, /* Do this outside the status_mutex to avoid a circular dependency with * the locking in cxl_mmap_fault() */ - if (copy_from_user(&work, uwork, - sizeof(struct cxl_ioctl_start_work))) { - rc = -EFAULT; - goto out; - } + if (copy_from_user(&work, uwork, sizeof(work))) + return -EFAULT; mutex_lock(&ctx->status_mutex); if (ctx->status != OPENED) { From 92d9dfda8b547cc292af27e11e11c9eff3bb574f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 1 Jun 2017 20:05:04 +0530 Subject: [PATCH 12/25] powerpc/mm/4k: Limit 4k page size config to 64TB virtual address space Supporting 512TB requires us to do a order 3 allocation for level 1 page table (pgd). This results in page allocation failures with certain workloads. For now limit 4k linux page size config to 64TB. Fixes: f6eedbba7a26 ("powerpc/mm/hash: Increase VA range to 128TB") Reported-by: Hugh Dickins Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hash-4k.h | 2 +- arch/powerpc/include/asm/processor.h | 25 ++++++++++---------- arch/powerpc/kernel/setup-common.c | 2 +- arch/powerpc/mm/mmu_context_book3s64.c | 2 +- 4 files changed, 15 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index b4b5e6b671ca..0c4e470571ca 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -8,7 +8,7 @@ #define H_PTE_INDEX_SIZE 9 #define H_PMD_INDEX_SIZE 7 #define H_PUD_INDEX_SIZE 9 -#define H_PGD_INDEX_SIZE 12 +#define H_PGD_INDEX_SIZE 9 #ifndef __ASSEMBLY__ #define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index a2123f291ab0..bb99b651085a 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -110,13 +110,18 @@ void release_thread(struct task_struct *); #define TASK_SIZE_128TB (0x0000800000000000UL) #define TASK_SIZE_512TB (0x0002000000000000UL) -#ifdef CONFIG_PPC_BOOK3S_64 +/* + * For now 512TB is only supported with book3s and 64K linux page size. + */ +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_64K_PAGES) /* * Max value currently used: */ -#define TASK_SIZE_USER64 TASK_SIZE_512TB +#define TASK_SIZE_USER64 TASK_SIZE_512TB +#define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_128TB #else -#define TASK_SIZE_USER64 TASK_SIZE_64TB +#define TASK_SIZE_USER64 TASK_SIZE_64TB +#define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_64TB #endif /* @@ -132,7 +137,7 @@ void release_thread(struct task_struct *); * space during mmap's. */ #define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4)) -#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(TASK_SIZE_128TB / 4)) +#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(DEFAULT_MAP_WINDOW_USER64 / 4)) #define TASK_UNMAPPED_BASE ((is_32bit_task()) ? \ TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64 ) @@ -143,21 +148,15 @@ void release_thread(struct task_struct *); * with 128TB and conditionally enable upto 512TB */ #ifdef CONFIG_PPC_BOOK3S_64 -#define DEFAULT_MAP_WINDOW ((is_32bit_task()) ? \ - TASK_SIZE_USER32 : TASK_SIZE_128TB) +#define DEFAULT_MAP_WINDOW ((is_32bit_task()) ? \ + TASK_SIZE_USER32 : DEFAULT_MAP_WINDOW_USER64) #else #define DEFAULT_MAP_WINDOW TASK_SIZE #endif #ifdef __powerpc64__ -#ifdef CONFIG_PPC_BOOK3S_64 -/* Limit stack to 128TB */ -#define STACK_TOP_USER64 TASK_SIZE_128TB -#else -#define STACK_TOP_USER64 TASK_SIZE_USER64 -#endif - +#define STACK_TOP_USER64 DEFAULT_MAP_WINDOW_USER64 #define STACK_TOP_USER32 TASK_SIZE_USER32 #define STACK_TOP (is_32bit_task() ? \ diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 71dcda91755d..857129acf960 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -928,7 +928,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_PPC_MM_SLICES #ifdef CONFIG_PPC64 - init_mm.context.addr_limit = TASK_SIZE_128TB; + init_mm.context.addr_limit = DEFAULT_MAP_WINDOW_USER64; #else #error "context.addr_limit not initialized." #endif diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index c6dca2ae78ef..a3edf813d455 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -99,7 +99,7 @@ static int hash__init_new_context(struct mm_struct *mm) * mm->context.addr_limit. Default to max task size so that we copy the * default values to paca which will help us to handle slb miss early. */ - mm->context.addr_limit = TASK_SIZE_128TB; + mm->context.addr_limit = DEFAULT_MAP_WINDOW_USER64; /* * The old code would re-promote on fork, we don't do that when using From c6ee9619e2edd9912316f7e2eaf9ffa14fafe9f9 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 8 Jun 2017 16:29:59 +1000 Subject: [PATCH 13/25] powerpc/book3s64: Move PPC_DT_CPU_FTRs and enable it by default The PPC_DT_CPU_FTRs is a bit misplaced in menuconfig, it shows up with other general kernel options. It's really more at home in the "Platform Support" section, so move it there. Also enable it by default, for Book3s 64. It does mostly nothing unless the device tree properties are found, and we will want it enabled eventually in distro kernels, so turn it on to start getting more testing. Fixes: 5a61ef74f269 ("powerpc/64s: Support new device tree binding for discovering CPU features") Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 11 ----------- arch/powerpc/platforms/Kconfig | 11 +++++++++++ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 4a4a05afcaf7..964da1891ea9 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -380,17 +380,6 @@ source "arch/powerpc/platforms/Kconfig" menu "Kernel options" -config PPC_DT_CPU_FTRS - bool "Device-tree based CPU feature discovery & setup" - depends on PPC_BOOK3S_64 - default n - help - This enables code to use a new device tree binding for describing CPU - compatibility and features. Saying Y here will attempt to use the new - binding if the firmware provides it. Currently only the skiboot - firmware provides this binding. - If you're not sure say Y. - config HIGHMEM bool "High memory support" depends on PPC32 diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index 33244e3d9375..4fd64d3f5c44 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -59,6 +59,17 @@ config PPC_OF_BOOT_TRAMPOLINE In case of doubt, say Y +config PPC_DT_CPU_FTRS + bool "Device-tree based CPU feature discovery & setup" + depends on PPC_BOOK3S_64 + default y + help + This enables code to use a new device tree binding for describing CPU + compatibility and features. Saying Y here will attempt to use the new + binding if the firmware provides it. Currently only the skiboot + firmware provides this binding. + If you're not sure say Y. + config UDBG_RTAS_CONSOLE bool "RTAS based debug console" depends on PPC_RTAS From 377aa6b0efbaa29cfeecd8b9244641217f9544ca Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 14 Jun 2017 14:47:50 +1000 Subject: [PATCH 14/25] powerpc/npu-dma: Remove spurious WARN_ON when a PCI device has no of_node Commit 4c3b89effc28 ("powerpc/powernv: Add sanity checks to pnv_pci_get_{gpu|npu}_dev") introduced explicit warnings in pnv_pci_get_npu_dev() when a PCIe device has no associated device-tree node. However not all PCIe devices have an of_node and pnv_pci_get_npu_dev() gets indirectly called at least once for every PCIe device in the system. This results in spurious WARN_ON()'s so remove it. The same situation should not exist for pnv_pci_get_gpu_dev() as any NPU based PCIe device requires a device-tree node. Fixes: 4c3b89effc28 ("powerpc/powernv: Add sanity checks to pnv_pci_get_{gpu|npu}_dev") Reported-by: Alexey Kardashevskiy Signed-off-by: Alistair Popple Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/npu-dma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 78fa9395b8c5..e6f444b46207 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -75,7 +75,8 @@ struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index) if (WARN_ON(!gpdev)) return NULL; - if (WARN_ON(!gpdev->dev.of_node)) + /* Not all PCI devices have device-tree nodes */ + if (!gpdev->dev.of_node) return NULL; /* Get assoicated PCI device */ From 25642705b2359a705784bbbf1655c25a8f8efde2 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 14 Jun 2017 10:19:25 +1000 Subject: [PATCH 15/25] powerpc/xive: Fix offset for store EOI MMIOs Architecturally we should apply a 0x400 offset for these. Not doing it will break future HW implementations. The offset of 0 is supposed to remain for "triggers" though not all sources support both trigger and store EOI, and in P9 specifically, some sources will treat 0 as a store EOI. But future chips will not. So this makes us use the properly architected offset which should work always. Fixes: 243e25112d06 ("powerpc/xive: Native exploitation of the XIVE interrupt controller") Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/xive.h | 12 +++++++----- arch/powerpc/kvm/book3s_xive_template.c | 4 ++-- arch/powerpc/sysdev/xive/common.c | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index c8a822acf962..c23ff4389ca2 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -94,11 +94,13 @@ struct xive_q { * store at 0 and some ESBs support doing a trigger via a * separate trigger page. */ -#define XIVE_ESB_GET 0x800 -#define XIVE_ESB_SET_PQ_00 0xc00 -#define XIVE_ESB_SET_PQ_01 0xd00 -#define XIVE_ESB_SET_PQ_10 0xe00 -#define XIVE_ESB_SET_PQ_11 0xf00 +#define XIVE_ESB_STORE_EOI 0x400 /* Store */ +#define XIVE_ESB_LOAD_EOI 0x000 /* Load */ +#define XIVE_ESB_GET 0x800 /* Load */ +#define XIVE_ESB_SET_PQ_00 0xc00 /* Load */ +#define XIVE_ESB_SET_PQ_01 0xd00 /* Load */ +#define XIVE_ESB_SET_PQ_10 0xe00 /* Load */ +#define XIVE_ESB_SET_PQ_11 0xf00 /* Load */ #define XIVE_ESB_VAL_P 0x2 #define XIVE_ESB_VAL_Q 0x1 diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c index 023a31133c37..4636ca6e7d38 100644 --- a/arch/powerpc/kvm/book3s_xive_template.c +++ b/arch/powerpc/kvm/book3s_xive_template.c @@ -69,7 +69,7 @@ static void GLUE(X_PFX,source_eoi)(u32 hw_irq, struct xive_irq_data *xd) { /* If the XIVE supports the new "store EOI facility, use it */ if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) - __x_writeq(0, __x_eoi_page(xd)); + __x_writeq(0, __x_eoi_page(xd) + XIVE_ESB_STORE_EOI); else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) { opal_int_eoi(hw_irq); } else { @@ -89,7 +89,7 @@ static void GLUE(X_PFX,source_eoi)(u32 hw_irq, struct xive_irq_data *xd) * properly. */ if (xd->flags & XIVE_IRQ_FLAG_LSI) - __x_readq(__x_eoi_page(xd)); + __x_readq(__x_eoi_page(xd) + XIVE_ESB_LOAD_EOI); else { eoi_val = GLUE(X_PFX,esb_load)(xd, XIVE_ESB_SET_PQ_00); diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 913825086b8d..8f5e3035483b 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -297,7 +297,7 @@ void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) { /* If the XIVE supports the new "store EOI facility, use it */ if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) - out_be64(xd->eoi_mmio, 0); + out_be64(xd->eoi_mmio + XIVE_ESB_STORE_EOI, 0); else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) { /* * The FW told us to call it. This happens for some From a093c92dc7f96a15de98ec8cfe38e6f7610a5969 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 14 Jun 2017 13:01:25 +1000 Subject: [PATCH 16/25] powerpc/debug: Add missing warn flag to WARN_ON's non-builtin path When trapped on WARN_ON(), report_bug() is expected to return BUG_TRAP_TYPE_WARN so the caller will increment NIP by 4 and continue. The __builtin_constant_p() path of the PPC's WARN_ON() calls (indirectly) __WARN_FLAGS() which has BUGFLAG_WARNING set, however the other branch does not which makes report_bug() report a bug rather than a warning. Fixes: f26dee15103f ("debug: Avoid setting BUGFLAG_WARNING twice") Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/bug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index f2c562a0a427..0151af6c2a50 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -104,7 +104,7 @@ "1: "PPC_TLNEI" %4,0\n" \ _EMIT_BUG_ENTRY \ : : "i" (__FILE__), "i" (__LINE__), \ - "i" (BUGFLAG_TAINT(TAINT_WARN)), \ + "i" (BUGFLAG_WARNING|BUGFLAG_TAINT(TAINT_WARN)),\ "i" (sizeof(struct bug_entry)), \ "r" (__ret_warn_on)); \ } \ From a9f8553e935f26cb5447f67e280946b0923cd2dc Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 1 Jun 2017 16:18:15 +0530 Subject: [PATCH 17/25] powerpc/kprobes: Pause function_graph tracing during jprobes handling This fixes a crash when function_graph and jprobes are used together. This is essentially commit 237d28db036e ("ftrace/jprobes/x86: Fix conflict between jprobes and function graph tracing"), but for powerpc. Jprobes breaks function_graph tracing since the jprobe hook needs to use jprobe_return(), which never returns back to the hook, but instead to the original jprobe'd function. The solution is to momentarily pause function_graph tracing before invoking the jprobe hook and re-enable it when returning back to the original jprobe'd function. Fixes: 6794c78243bf ("powerpc64: port of the function graph tracer") Cc: stable@vger.kernel.org # v2.6.30+ Signed-off-by: Naveen N. Rao Acked-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/kprobes.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index fc4343514bed..5075a4d6f1d7 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -617,6 +617,15 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) regs->gpr[2] = (unsigned long)(((func_descr_t *)jp->entry)->toc); #endif + /* + * jprobes use jprobe_return() which skips the normal return + * path of the function, and this messes up the accounting of the + * function graph tracer. + * + * Pause function graph tracing while performing the jprobe function. + */ + pause_graph_tracing(); + return 1; } NOKPROBE_SYMBOL(setjmp_pre_handler); @@ -642,6 +651,8 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) * saved regs... */ memcpy(regs, &kcb->jprobe_saved_regs, sizeof(struct pt_regs)); + /* It's OK to start function graph tracing again */ + unpause_graph_tracing(); preempt_enable_no_resched(); return 1; } From a4979a7e71eb8da976cbe4a0a1fa50636e76b04f Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 1 Jun 2017 16:18:16 +0530 Subject: [PATCH 18/25] powerpc/ftrace: Pass the correct stack pointer for DYNAMIC_FTRACE_WITH_REGS For DYNAMIC_FTRACE_WITH_REGS, we should be passing-in the original set of registers in pt_regs, to capture the state _before_ ftrace_caller. However, we are instead passing the stack pointer *after* allocating a stack frame in ftrace_caller. Fix this by saving the proper value of r1 in pt_regs. Also, use SAVE_10GPRS() to simplify the code. Fixes: 153086644fd1 ("powerpc/ftrace: Add support for -mprofile-kernel ftrace ABI") Cc: stable@vger.kernel.org # v4.6+ Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- .../powerpc/kernel/trace/ftrace_64_mprofile.S | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index 7c933a99f5d5..fa0921410fa4 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -45,10 +45,14 @@ _GLOBAL(ftrace_caller) stdu r1,-SWITCH_FRAME_SIZE(r1) /* Save all gprs to pt_regs */ - SAVE_8GPRS(0,r1) - SAVE_8GPRS(8,r1) - SAVE_8GPRS(16,r1) - SAVE_8GPRS(24,r1) + SAVE_GPR(0, r1) + SAVE_10GPRS(2, r1) + SAVE_10GPRS(12, r1) + SAVE_10GPRS(22, r1) + + /* Save previous stack pointer (r1) */ + addi r8, r1, SWITCH_FRAME_SIZE + std r8, GPR1(r1) /* Load special regs for save below */ mfmsr r8 @@ -103,10 +107,10 @@ ftrace_call: #endif /* Restore gprs */ - REST_8GPRS(0,r1) - REST_8GPRS(8,r1) - REST_8GPRS(16,r1) - REST_8GPRS(24,r1) + REST_GPR(0,r1) + REST_10GPRS(2,r1) + REST_10GPRS(12,r1) + REST_10GPRS(22,r1) /* Restore possibly modified LR */ ld r0, _LINK(r1) From c05b8c4474c03026aaa7f8872e78369f69f1bb08 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 1 Jun 2017 16:18:17 +0530 Subject: [PATCH 19/25] powerpc/kprobes: Skip livepatch_handler() for jprobes ftrace_caller() depends on a modified regs->nip to detect if a certain function has been livepatched. However, with KPROBES_ON_FTRACE, it is possible for regs->nip to have been modified by the kprobes pre_handler (jprobes, for instance). In this case, we do not want to invoke the livepatch_handler so as not to consume the livepatch stack. To distinguish between the two (kprobes and livepatch), we check if there is an active kprobe on the current function. If there is, then we know for sure that it must have modified the NIP as we don't support livepatching a kprobe'd function. In this case, we simply skip the livepatch_handler and branch to the new NIP. Otherwise, the livepatch_handler is invoked. Fixes: ead514d5fb30 ("powerpc/kprobes: Add support for KPROBES_ON_FTRACE") Signed-off-by: Naveen N. Rao Reviewed-by: Masami Hiramatsu Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kprobes.h | 1 + arch/powerpc/kernel/kprobes.c | 6 +++ .../powerpc/kernel/trace/ftrace_64_mprofile.S | 39 ++++++++++++++++--- 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h index a83821f33ea3..8814a7249ceb 100644 --- a/arch/powerpc/include/asm/kprobes.h +++ b/arch/powerpc/include/asm/kprobes.h @@ -103,6 +103,7 @@ extern int kprobe_exceptions_notify(struct notifier_block *self, extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr); extern int kprobe_handler(struct pt_regs *regs); extern int kprobe_post_handler(struct pt_regs *regs); +extern int is_current_kprobe_addr(unsigned long addr); #ifdef CONFIG_KPROBES_ON_FTRACE extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb); diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 5075a4d6f1d7..01addfb0ed0a 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -43,6 +43,12 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}}; +int is_current_kprobe_addr(unsigned long addr) +{ + struct kprobe *p = kprobe_running(); + return (p && (unsigned long)p->addr == addr) ? 1 : 0; +} + bool arch_within_kprobe_blacklist(unsigned long addr) { return (addr >= (unsigned long)__kprobes_text_start && diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index fa0921410fa4..c98e90b4ea7b 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -99,13 +99,39 @@ ftrace_call: bl ftrace_stub nop - /* Load ctr with the possibly modified NIP */ - ld r3, _NIP(r1) - mtctr r3 + /* Load the possibly modified NIP */ + ld r15, _NIP(r1) + #ifdef CONFIG_LIVEPATCH - cmpd r14,r3 /* has NIP been altered? */ + cmpd r14, r15 /* has NIP been altered? */ #endif +#if defined(CONFIG_LIVEPATCH) && defined(CONFIG_KPROBES_ON_FTRACE) + /* NIP has not been altered, skip over further checks */ + beq 1f + + /* Check if there is an active kprobe on us */ + subi r3, r14, 4 + bl is_current_kprobe_addr + nop + + /* + * If r3 == 1, then this is a kprobe/jprobe. + * else, this is livepatched function. + * + * The conditional branch for livepatch_handler below will use the + * result of this comparison. For kprobe/jprobe, we just need to branch to + * the new NIP, not call livepatch_handler. The branch below is bne, so we + * want CR0[EQ] to be true if this is a kprobe/jprobe. Which means we want + * CR0[EQ] = (r3 == 1). + */ + cmpdi r3, 1 +1: +#endif + + /* Load CTR with the possibly modified NIP */ + mtctr r15 + /* Restore gprs */ REST_GPR(0,r1) REST_10GPRS(2,r1) @@ -123,7 +149,10 @@ ftrace_call: addi r1, r1, SWITCH_FRAME_SIZE #ifdef CONFIG_LIVEPATCH - /* Based on the cmpd above, if the NIP was altered handle livepatch */ + /* + * Based on the cmpd or cmpdi above, if the NIP was altered and we're + * not on a kprobe/jprobe, then handle livepatch. + */ bne- livepatch_handler #endif From d89ba5353f301971dd7d2f9fdf25c4432728f38e Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Wed, 14 Jun 2017 00:12:00 +0530 Subject: [PATCH 20/25] powerpc/64s: Handle data breakpoints in Radix mode On Power9, trying to use data breakpoints throws the splat shown below. This is because the check for a data breakpoint in DSISR is in do_hash_page(), which is not called when in Radix mode. Unable to handle kernel paging request for data at address 0xc000000000e19218 Faulting instruction address: 0xc0000000001155e8 cpu 0x0: Vector: 300 (Data Access) at [c0000000ef1e7b20] pc: c0000000001155e8: find_pid_ns+0x48/0xe0 lr: c000000000116ac4: find_task_by_vpid+0x44/0x90 sp: c0000000ef1e7da0 msr: 9000000000009033 dar: c000000000e19218 dsisr: 400000 Move the check to handle_page_fault() so as to catch data breakpoints in both Hash and Radix MMU modes. We have to change the check in do_hash_page() against 0xa410 to use 0xa450, so as to include the value of (DSISR_DABRMATCH << 16). There are two sites that call handle_page_fault() when in Radix, both already pass DSISR in r4. Fixes: caca285e5ab4 ("powerpc/mm/radix: Use STD_MMU_64 to properly isolate hash related code") Cc: stable@vger.kernel.org # v4.7+ Reported-by: Shriya R. Kulkarni Signed-off-by: Naveen N. Rao [mpe: Fix the fall-through case on hash, we need to reload DSISR] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index ae418b85c17c..b886795060fd 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1411,10 +1411,8 @@ USE_TEXT_SECTION() .balign IFETCH_ALIGN_BYTES do_hash_page: #ifdef CONFIG_PPC_STD_MMU_64 - andis. r0,r4,0xa410 /* weird error? */ + andis. r0,r4,0xa450 /* weird error? */ bne- handle_page_fault /* if not, try to insert a HPTE */ - andis. r0,r4,DSISR_DABRMATCH@h - bne- handle_dabr_fault CURRENT_THREAD_INFO(r11, r1) lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */ andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */ @@ -1438,11 +1436,16 @@ do_hash_page: /* Error */ blt- 13f + + /* Reload DSISR into r4 for the DABR check below */ + ld r4,_DSISR(r1) #endif /* CONFIG_PPC_STD_MMU_64 */ /* Here we have a page fault that hash_page can't handle. */ handle_page_fault: -11: ld r4,_DAR(r1) +11: andis. r0,r4,DSISR_DABRMATCH@h + bne- handle_dabr_fault + ld r4,_DAR(r1) ld r5,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD bl do_page_fault From bf05fc25f268cd62f147f368fe65ad3e5b04fe9f Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 15 Jun 2017 19:16:48 +0530 Subject: [PATCH 21/25] powerpc/perf: Fix oops when kthread execs user process When a kthread calls call_usermodehelper() the steps are: 1. allocate current->mm 2. load_elf_binary() 3. populate current->thread.regs While doing this, interrupts are not disabled. If there is a perf interrupt in the middle of this process (i.e. step 1 has completed but not yet reached to step 3) and if perf tries to read userspace regs, kernel oops with following log: Unable to handle kernel paging request for data at address 0x00000000 Faulting instruction address: 0xc0000000000da0fc ... Call Trace: perf_output_sample_regs+0x6c/0xd0 perf_output_sample+0x4e4/0x830 perf_event_output_forward+0x64/0x90 __perf_event_overflow+0x8c/0x1e0 record_and_restart+0x220/0x5c0 perf_event_interrupt+0x2d8/0x4d0 performance_monitor_exception+0x54/0x70 performance_monitor_common+0x158/0x160 --- interrupt: f01 at avtab_search_node+0x150/0x1a0 LR = avtab_search_node+0x100/0x1a0 ... load_elf_binary+0x6e8/0x15a0 search_binary_handler+0xe8/0x290 do_execveat_common.isra.14+0x5f4/0x840 call_usermodehelper_exec_async+0x170/0x210 ret_from_kernel_thread+0x5c/0x7c Fix it by setting abi to PERF_SAMPLE_REGS_ABI_NONE when userspace pt_regs are not set. Fixes: ed4a4ef85cf5 ("powerpc/perf: Add support for sampling interrupt register state") Cc: stable@vger.kernel.org # v4.7+ Signed-off-by: Ravi Bangoria Acked-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/perf/perf_regs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c index cbd82fde5770..09ceea6175ba 100644 --- a/arch/powerpc/perf/perf_regs.c +++ b/arch/powerpc/perf/perf_regs.c @@ -101,5 +101,6 @@ void perf_get_regs_user(struct perf_regs *regs_user, struct pt_regs *regs_user_copy) { regs_user->regs = task_pt_regs(current); - regs_user->abi = perf_reg_abi(current); + regs_user->abi = (regs_user->regs) ? perf_reg_abi(current) : + PERF_SAMPLE_REGS_ABI_NONE; } From bbd5ff50afffcf4a01d05367524736c57607a478 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 20 Jun 2017 18:37:28 +1000 Subject: [PATCH 22/25] powerpc/powernv/npu-dma: Add explicit flush when sending an ATSD NPU2 requires an extra explicit flush to an active GPU PID when sending address translation shoot downs (ATSDs) to reliably flush the GPU TLB. This patch adds just such a flush at the end of each sequence of ATSDs. We can safely use PID 0 which is always reserved and active on the GPU. PID 0 is only used for init_mm which will never be a user mm on the GPU. To enforce this we add a check in pnv_npu2_init_context() just in case someone tries to use PID 0 on the GPU. Signed-off-by: Alistair Popple [mpe: Use true/false for bool literals] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/npu-dma.c | 94 ++++++++++++++++-------- 1 file changed, 65 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index e6f444b46207..b5d960d6db3d 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -449,7 +449,7 @@ static int mmio_launch_invalidate(struct npu *npu, unsigned long launch, return mmio_atsd_reg; } -static int mmio_invalidate_pid(struct npu *npu, unsigned long pid) +static int mmio_invalidate_pid(struct npu *npu, unsigned long pid, bool flush) { unsigned long launch; @@ -465,12 +465,15 @@ static int mmio_invalidate_pid(struct npu *npu, unsigned long pid) /* PID */ launch |= pid << PPC_BITLSHIFT(38); + /* No flush */ + launch |= !flush << PPC_BITLSHIFT(39); + /* Invalidating the entire process doesn't use a va */ return mmio_launch_invalidate(npu, launch, 0); } static int mmio_invalidate_va(struct npu *npu, unsigned long va, - unsigned long pid) + unsigned long pid, bool flush) { unsigned long launch; @@ -486,26 +489,60 @@ static int mmio_invalidate_va(struct npu *npu, unsigned long va, /* PID */ launch |= pid << PPC_BITLSHIFT(38); + /* No flush */ + launch |= !flush << PPC_BITLSHIFT(39); + return mmio_launch_invalidate(npu, launch, va); } #define mn_to_npu_context(x) container_of(x, struct npu_context, mn) +struct mmio_atsd_reg { + struct npu *npu; + int reg; +}; + +static void mmio_invalidate_wait( + struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], bool flush) +{ + struct npu *npu; + int i, reg; + + /* Wait for all invalidations to complete */ + for (i = 0; i <= max_npu2_index; i++) { + if (mmio_atsd_reg[i].reg < 0) + continue; + + /* Wait for completion */ + npu = mmio_atsd_reg[i].npu; + reg = mmio_atsd_reg[i].reg; + while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT)) + cpu_relax(); + + put_mmio_atsd_reg(npu, reg); + + /* + * The GPU requires two flush ATSDs to ensure all entries have + * been flushed. We use PID 0 as it will never be used for a + * process on the GPU. + */ + if (flush) + mmio_invalidate_pid(npu, 0, true); + } +} + /* * Invalidate either a single address or an entire PID depending on * the value of va. */ static void mmio_invalidate(struct npu_context *npu_context, int va, - unsigned long address) + unsigned long address, bool flush) { - int i, j, reg; + int i, j; struct npu *npu; struct pnv_phb *nphb; struct pci_dev *npdev; - struct { - struct npu *npu; - int reg; - } mmio_atsd_reg[NV_MAX_NPUS]; + struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]; unsigned long pid = npu_context->mm->context.id; /* @@ -525,10 +562,11 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, if (va) mmio_atsd_reg[i].reg = - mmio_invalidate_va(npu, address, pid); + mmio_invalidate_va(npu, address, pid, + flush); else mmio_atsd_reg[i].reg = - mmio_invalidate_pid(npu, pid); + mmio_invalidate_pid(npu, pid, flush); /* * The NPU hardware forwards the shootdown to all GPUs @@ -544,18 +582,10 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, */ flush_tlb_mm(npu_context->mm); - /* Wait for all invalidations to complete */ - for (i = 0; i <= max_npu2_index; i++) { - if (mmio_atsd_reg[i].reg < 0) - continue; - - /* Wait for completion */ - npu = mmio_atsd_reg[i].npu; - reg = mmio_atsd_reg[i].reg; - while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT)) - cpu_relax(); - put_mmio_atsd_reg(npu, reg); - } + mmio_invalidate_wait(mmio_atsd_reg, flush); + if (flush) + /* Wait for the flush to complete */ + mmio_invalidate_wait(mmio_atsd_reg, false); } static void pnv_npu2_mn_release(struct mmu_notifier *mn, @@ -571,7 +601,7 @@ static void pnv_npu2_mn_release(struct mmu_notifier *mn, * There should be no more translation requests for this PID, but we * need to ensure any entries for it are removed from the TLB. */ - mmio_invalidate(npu_context, 0, 0); + mmio_invalidate(npu_context, 0, 0, true); } static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, @@ -581,7 +611,7 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, { struct npu_context *npu_context = mn_to_npu_context(mn); - mmio_invalidate(npu_context, 1, address); + mmio_invalidate(npu_context, 1, address, true); } static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn, @@ -590,7 +620,7 @@ static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn, { struct npu_context *npu_context = mn_to_npu_context(mn); - mmio_invalidate(npu_context, 1, address); + mmio_invalidate(npu_context, 1, address, true); } static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, @@ -600,8 +630,11 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, struct npu_context *npu_context = mn_to_npu_context(mn); unsigned long address; - for (address = start; address <= end; address += PAGE_SIZE) - mmio_invalidate(npu_context, 1, address); + for (address = start; address < end; address += PAGE_SIZE) + mmio_invalidate(npu_context, 1, address, false); + + /* Do the flush only on the final addess == end */ + mmio_invalidate(npu_context, 1, address, true); } static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { @@ -651,8 +684,11 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, /* No nvlink associated with this GPU device */ return ERR_PTR(-ENODEV); - if (!mm) { - /* kernel thread contexts are not supported */ + if (!mm || mm->context.id == 0) { + /* + * Kernel thread contexts are not supported and context id 0 is + * reserved on the GPU. + */ return ERR_PTR(-EINVAL); } From 34f19ff1b5a0d11e46df479623d6936460105c9f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 21 Jun 2017 15:58:29 +1000 Subject: [PATCH 23/25] powerpc/64: Initialise thread_info for emergency stacks Emergency stacks have their thread_info mostly uninitialised, which in particular means garbage preempt_count values. Emergency stack code runs with interrupts disabled entirely, and is used very rarely, so this has been unnoticed so far. It was found by a proposed new powerpc watchdog that takes a soft-NMI directly from the masked_interrupt handler and using the emergency stack. That crashed at BUG_ON(in_nmi()) in nmi_enter(). preempt_count()s were found to be garbage. To fix this, zero the entire THREAD_SIZE allocation, and initialize the thread_info. Cc: stable@vger.kernel.org Reported-by: Abdul Haleem Signed-off-by: Nicholas Piggin [mpe: Move it all into setup_64.c, use a function not a macro. Fix crashes on Cell by setting preempt_count to 0 not HARDIRQ_OFFSET] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup_64.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index a8c1f99e9607..4640f6d64f8b 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -615,6 +615,24 @@ void __init exc_lvl_early_init(void) } #endif +/* + * Emergency stacks are used for a range of things, from asynchronous + * NMIs (system reset, machine check) to synchronous, process context. + * We set preempt_count to zero, even though that isn't necessarily correct. To + * get the right value we'd need to copy it from the previous thread_info, but + * doing that might fault causing more problems. + * TODO: what to do with accounting? + */ +static void emerg_stack_init_thread_info(struct thread_info *ti, int cpu) +{ + ti->task = NULL; + ti->cpu = cpu; + ti->preempt_count = 0; + ti->local_flags = 0; + ti->flags = 0; + klp_init_thread_info(ti); +} + /* * Stack space used when we detect a bad kernel stack pointer, and * early in SMP boots before relocation is enabled. Exclusive emergency @@ -633,24 +651,31 @@ void __init emergency_stack_init(void) * Since we use these as temporary stacks during secondary CPU * bringup, we need to get at them in real mode. This means they * must also be within the RMO region. + * + * The IRQ stacks allocated elsewhere in this file are zeroed and + * initialized in kernel/irq.c. These are initialized here in order + * to have emergency stacks available as early as possible. */ limit = min(safe_stack_limit(), ppc64_rma_size); for_each_possible_cpu(i) { struct thread_info *ti; ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit)); - klp_init_thread_info(ti); + memset(ti, 0, THREAD_SIZE); + emerg_stack_init_thread_info(ti, i); paca[i].emergency_sp = (void *)ti + THREAD_SIZE; #ifdef CONFIG_PPC_BOOK3S_64 /* emergency stack for NMI exception handling. */ ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit)); - klp_init_thread_info(ti); + memset(ti, 0, THREAD_SIZE); + emerg_stack_init_thread_info(ti, i); paca[i].nmi_emergency_sp = (void *)ti + THREAD_SIZE; /* emergency stack for machine check exception handling. */ ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit)); - klp_init_thread_info(ti); + memset(ti, 0, THREAD_SIZE); + emerg_stack_init_thread_info(ti, i); paca[i].mc_emergency_sp = (void *)ti + THREAD_SIZE; #endif } From 797625deaedd9a0621376817db2813244b3246e3 Mon Sep 17 00:00:00 2001 From: Christophe Lombard Date: Tue, 13 Jun 2017 17:41:05 +0200 Subject: [PATCH 24/25] cxl: Fixes for Coherent Accelerator Interface Architecture 2.0 A previous set of patches "cxl: Add support for Coherent Accelerator Interface Architecture 2.0" has introduced a new support for the CAPI cards. These patches have been tested on Simulation environment and quite a bit of them have been tested on real hardware. This patch brings new fixes after a series of tests carried out on new equipment: - Add POWER9 definition. - Re-enable any masked interrupts when the AFU is not activated after resetting the AFU. - Remove the api cxl_is_psl8/9 which is no longer useful. - Do not dump CAPI1 registers. - Rewrite cxl_is_page_fault() function. - Do not register slb callack on P9. Fixes: f24be42aab37 ("cxl: Add psl9 specific code") Signed-off-by: Christophe Lombard Acked-by: Frederic Barrat Signed-off-by: Michael Ellerman --- drivers/misc/cxl/context.c | 6 +++--- drivers/misc/cxl/cxl.h | 18 +++++------------- drivers/misc/cxl/fault.c | 23 +++++++++++++++-------- drivers/misc/cxl/main.c | 17 +++++++++++++---- drivers/misc/cxl/native.c | 29 +++++++++++++++++------------ drivers/misc/cxl/pci.c | 11 ++++------- 6 files changed, 57 insertions(+), 47 deletions(-) diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index 4472ce11f98d..8c32040b9c09 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -45,7 +45,7 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master) mutex_init(&ctx->mapping_lock); ctx->mapping = NULL; - if (cxl_is_psl8(afu)) { + if (cxl_is_power8()) { spin_lock_init(&ctx->sste_lock); /* @@ -189,7 +189,7 @@ int cxl_context_iomap(struct cxl_context *ctx, struct vm_area_struct *vma) if (start + len > ctx->afu->adapter->ps_size) return -EINVAL; - if (cxl_is_psl9(ctx->afu)) { + if (cxl_is_power9()) { /* * Make sure there is a valid problem state * area space for this AFU. @@ -324,7 +324,7 @@ static void reclaim_ctx(struct rcu_head *rcu) { struct cxl_context *ctx = container_of(rcu, struct cxl_context, rcu); - if (cxl_is_psl8(ctx->afu)) + if (cxl_is_power8()) free_page((u64)ctx->sstp); if (ctx->ff_page) __free_page(ctx->ff_page); diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index c8568ea7c518..a03f8e7535e5 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -357,6 +357,7 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0}; #define CXL_PSL9_DSISR_An_PF_RGP 0x0000000000000090ULL /* PTE not found (Radix Guest (parent)) 0b10010000 */ #define CXL_PSL9_DSISR_An_PF_HRH 0x0000000000000094ULL /* PTE not found (HPT/Radix Host) 0b10010100 */ #define CXL_PSL9_DSISR_An_PF_STEG 0x000000000000009CULL /* PTE not found (STEG VA) 0b10011100 */ +#define CXL_PSL9_DSISR_An_URTCH 0x00000000000000B4ULL /* Unsupported Radix Tree Configuration 0b10110100 */ /****** CXL_PSL_TFC_An ******************************************************/ #define CXL_PSL_TFC_An_A (1ull << (63-28)) /* Acknowledge non-translation fault */ @@ -844,24 +845,15 @@ static inline bool cxl_is_power8(void) static inline bool cxl_is_power9(void) { - /* intermediate solution */ - if (!cxl_is_power8() && - (cpu_has_feature(CPU_FTRS_POWER9) || - cpu_has_feature(CPU_FTR_POWER9_DD1))) + if (pvr_version_is(PVR_POWER9)) return true; return false; } -static inline bool cxl_is_psl8(struct cxl_afu *afu) +static inline bool cxl_is_power9_dd1(void) { - if (afu->adapter->caia_major == 1) - return true; - return false; -} - -static inline bool cxl_is_psl9(struct cxl_afu *afu) -{ - if (afu->adapter->caia_major == 2) + if ((pvr_version_is(PVR_POWER9)) && + cpu_has_feature(CPU_FTR_POWER9_DD1)) return true; return false; } diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index 5344448f514e..c79e39bad7a4 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -187,7 +187,7 @@ static struct mm_struct *get_mem_context(struct cxl_context *ctx) static bool cxl_is_segment_miss(struct cxl_context *ctx, u64 dsisr) { - if ((cxl_is_psl8(ctx->afu)) && (dsisr & CXL_PSL_DSISR_An_DS)) + if ((cxl_is_power8() && (dsisr & CXL_PSL_DSISR_An_DS))) return true; return false; @@ -195,15 +195,22 @@ static bool cxl_is_segment_miss(struct cxl_context *ctx, u64 dsisr) static bool cxl_is_page_fault(struct cxl_context *ctx, u64 dsisr) { - if ((cxl_is_psl8(ctx->afu)) && (dsisr & CXL_PSL_DSISR_An_DM)) + u64 crs; /* Translation Checkout Response Status */ + + if ((cxl_is_power8()) && (dsisr & CXL_PSL_DSISR_An_DM)) return true; - if ((cxl_is_psl9(ctx->afu)) && - ((dsisr & CXL_PSL9_DSISR_An_CO_MASK) & - (CXL_PSL9_DSISR_An_PF_SLR | CXL_PSL9_DSISR_An_PF_RGC | - CXL_PSL9_DSISR_An_PF_RGP | CXL_PSL9_DSISR_An_PF_HRH | - CXL_PSL9_DSISR_An_PF_STEG))) - return true; + if (cxl_is_power9()) { + crs = (dsisr & CXL_PSL9_DSISR_An_CO_MASK); + if ((crs == CXL_PSL9_DSISR_An_PF_SLR) || + (crs == CXL_PSL9_DSISR_An_PF_RGC) || + (crs == CXL_PSL9_DSISR_An_PF_RGP) || + (crs == CXL_PSL9_DSISR_An_PF_HRH) || + (crs == CXL_PSL9_DSISR_An_PF_STEG) || + (crs == CXL_PSL9_DSISR_An_URTCH)) { + return true; + } + } return false; } diff --git a/drivers/misc/cxl/main.c b/drivers/misc/cxl/main.c index 1703655072b1..c1ba0d42cbc8 100644 --- a/drivers/misc/cxl/main.c +++ b/drivers/misc/cxl/main.c @@ -329,8 +329,15 @@ static int __init init_cxl(void) cxl_debugfs_init(); - if ((rc = register_cxl_calls(&cxl_calls))) - goto err; + /* + * we don't register the callback on P9. slb callack is only + * used for the PSL8 MMU and CX4. + */ + if (cxl_is_power8()) { + rc = register_cxl_calls(&cxl_calls); + if (rc) + goto err; + } if (cpu_has_feature(CPU_FTR_HVMODE)) { cxl_ops = &cxl_native_ops; @@ -347,7 +354,8 @@ static int __init init_cxl(void) return 0; err1: - unregister_cxl_calls(&cxl_calls); + if (cxl_is_power8()) + unregister_cxl_calls(&cxl_calls); err: cxl_debugfs_exit(); cxl_file_exit(); @@ -366,7 +374,8 @@ static void exit_cxl(void) cxl_debugfs_exit(); cxl_file_exit(); - unregister_cxl_calls(&cxl_calls); + if (cxl_is_power8()) + unregister_cxl_calls(&cxl_calls); idr_destroy(&cxl_adapter_idr); } diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c index 8d6ea9712dbd..2b2f8894149d 100644 --- a/drivers/misc/cxl/native.c +++ b/drivers/misc/cxl/native.c @@ -105,11 +105,16 @@ static int native_afu_reset(struct cxl_afu *afu) CXL_AFU_Cntl_An_RS_MASK | CXL_AFU_Cntl_An_ES_MASK, false); - /* Re-enable any masked interrupts */ - serr = cxl_p1n_read(afu, CXL_PSL_SERR_An); - serr &= ~CXL_PSL_SERR_An_IRQ_MASKS; - cxl_p1n_write(afu, CXL_PSL_SERR_An, serr); - + /* + * Re-enable any masked interrupts when the AFU is not + * activated to avoid side effects after attaching a process + * in dedicated mode. + */ + if (afu->current_mode == 0) { + serr = cxl_p1n_read(afu, CXL_PSL_SERR_An); + serr &= ~CXL_PSL_SERR_An_IRQ_MASKS; + cxl_p1n_write(afu, CXL_PSL_SERR_An, serr); + } return rc; } @@ -139,9 +144,9 @@ int cxl_psl_purge(struct cxl_afu *afu) pr_devel("PSL purge request\n"); - if (cxl_is_psl8(afu)) + if (cxl_is_power8()) trans_fault = CXL_PSL_DSISR_TRANS; - if (cxl_is_psl9(afu)) + if (cxl_is_power9()) trans_fault = CXL_PSL9_DSISR_An_TF; if (!cxl_ops->link_ok(afu->adapter, afu)) { @@ -603,7 +608,7 @@ static u64 calculate_sr(struct cxl_context *ctx) if (!test_tsk_thread_flag(current, TIF_32BIT)) sr |= CXL_PSL_SR_An_SF; } - if (cxl_is_psl9(ctx->afu)) { + if (cxl_is_power9()) { if (radix_enabled()) sr |= CXL_PSL_SR_An_XLAT_ror; else @@ -1117,10 +1122,10 @@ static irqreturn_t native_handle_psl_slice_error(struct cxl_context *ctx, static bool cxl_is_translation_fault(struct cxl_afu *afu, u64 dsisr) { - if ((cxl_is_psl8(afu)) && (dsisr & CXL_PSL_DSISR_TRANS)) + if ((cxl_is_power8()) && (dsisr & CXL_PSL_DSISR_TRANS)) return true; - if ((cxl_is_psl9(afu)) && (dsisr & CXL_PSL9_DSISR_An_TF)) + if ((cxl_is_power9()) && (dsisr & CXL_PSL9_DSISR_An_TF)) return true; return false; @@ -1194,10 +1199,10 @@ static void native_irq_wait(struct cxl_context *ctx) if (ph != ctx->pe) return; dsisr = cxl_p2n_read(ctx->afu, CXL_PSL_DSISR_An); - if (cxl_is_psl8(ctx->afu) && + if (cxl_is_power8() && ((dsisr & CXL_PSL_DSISR_PENDING) == 0)) return; - if (cxl_is_psl9(ctx->afu) && + if (cxl_is_power9() && ((dsisr & CXL_PSL9_DSISR_PENDING) == 0)) return; /* diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 6dc1ee5b92c9..1eb9859809bf 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -436,7 +436,7 @@ static int init_implementation_adapter_regs_psl9(struct cxl *adapter, struct pci /* nMMU_ID Defaults to: b’000001001’*/ xsl_dsnctl |= ((u64)0x09 << (63-28)); - if (cxl_is_power9() && !cpu_has_feature(CPU_FTR_POWER9_DD1)) { + if (!(cxl_is_power9_dd1())) { /* * Used to identify CAPI packets which should be sorted into * the Non-Blocking queues by the PHB. This field should match @@ -491,7 +491,7 @@ static int init_implementation_adapter_regs_psl9(struct cxl *adapter, struct pci cxl_p1_write(adapter, CXL_PSL9_APCDEDTYPE, 0x40000003FFFF0000ULL); /* Disable vc dd1 fix */ - if ((cxl_is_power9() && cpu_has_feature(CPU_FTR_POWER9_DD1))) + if (cxl_is_power9_dd1()) cxl_p1_write(adapter, CXL_PSL9_GP_CT, 0x0400000000000001ULL); return 0; @@ -1439,8 +1439,7 @@ int cxl_pci_reset(struct cxl *adapter) * The adapter is about to be reset, so ignore errors. * Not supported on P9 DD1 */ - if ((cxl_is_power8()) || - ((cxl_is_power9() && !cpu_has_feature(CPU_FTR_POWER9_DD1)))) + if ((cxl_is_power8()) || (!(cxl_is_power9_dd1()))) cxl_data_cache_flush(adapter); /* pcie_warm_reset requests a fundamental pci reset which includes a @@ -1750,7 +1749,6 @@ static const struct cxl_service_layer_ops psl9_ops = { .debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_psl9, .debugfs_add_afu_regs = cxl_debugfs_add_afu_regs_psl9, .psl_irq_dump_registers = cxl_native_irq_dump_regs_psl9, - .err_irq_dump_registers = cxl_native_err_irq_dump_regs, .debugfs_stop_trace = cxl_stop_trace_psl9, .write_timebase_ctrl = write_timebase_ctrl_psl9, .timebase_read = timebase_read_psl9, @@ -1889,8 +1887,7 @@ static void cxl_pci_remove_adapter(struct cxl *adapter) * Flush adapter datacache as its about to be removed. * Not supported on P9 DD1. */ - if ((cxl_is_power8()) || - ((cxl_is_power9() && !cpu_has_feature(CPU_FTR_POWER9_DD1)))) + if ((cxl_is_power8()) || (!(cxl_is_power9_dd1()))) cxl_data_cache_flush(adapter); cxl_deconfigure_adapter(adapter); From d6bd8194e2867e85ac2de63486d3b83ccfae4e62 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 26 Jun 2017 11:30:57 +1000 Subject: [PATCH 25/25] powerpc/32: Avoid miscompilation w/GCC 4.6.3 - don't inline copy_to/from_user() Larry Finger reported that his Powerbook G4 was no longer booting with v4.12-rc, userspace was up but giving weird errors such as: udevd[64]: starting version 175 udevd[64]: Unable to receive ctrl message: Bad address. modprobe: chdir(4.12-rc1): No such file or directory He bisected the problem to commit 3448890c32c3 ("powerpc: get rid of zeroing, switch to RAW_COPY_USER"). Al identified that the problem is actually a miscompilation by GCC 4.6.3, which is exposed by the above commit. Al also pointed out that inlining copy_to/from_user() is probably of little or no benefit, which is correct. Using Anton's copy_to_user benchmark, with a pathological single byte copy, we see a small increase in performance by *removing* inlining: Before (inlined): # time ./copy_to_user -w -l 1 -i 10000000 ( x 3 ) real 0m22.063s real 0m22.059s real 0m22.076s After: # time ./copy_to_user -w -l 1 -i 10000000 ( x 3 ) real 0m21.325s real 0m21.299s real 0m21.364s So as a small performance improvement and to avoid the miscompilation, drop inlining copy_to/from_user() on 32-bit. Fixes: 3448890c32c3 ("powerpc: get rid of zeroing, switch to RAW_COPY_USER") Reported-by: Larry Finger Suggested-by: Al Viro Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/uaccess.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 5c0d8a8cdae5..41e88d3ce36b 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -267,13 +267,7 @@ do { \ extern unsigned long __copy_tofrom_user(void __user *to, const void __user *from, unsigned long size); -#ifndef __powerpc64__ - -#define INLINE_COPY_FROM_USER -#define INLINE_COPY_TO_USER - -#else /* __powerpc64__ */ - +#ifdef __powerpc64__ static inline unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) {