From 8dec9cb9f525b6c51e3467202b9c627591c472f8 Mon Sep 17 00:00:00 2001 From: Holger Dengler Date: Fri, 8 Mar 2024 15:54:43 +0100 Subject: [PATCH 01/54] s390/ap: use static qci information Since qci is available on most of the current machines, move away from the dynamic buffers for qci information and store it instead in a statically defined buffer. The new flags member in struct ap_config_info is now used as an indicator of whether qci is available in the system (at least one of these bits is set). Suggested-by: Harald Freudenberger Signed-off-by: Holger Dengler Reviewed-by: Harald Freudenberger Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/ap.h | 19 +++++---- drivers/s390/crypto/ap_bus.c | 74 +++++++++--------------------------- 2 files changed, 30 insertions(+), 63 deletions(-) diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h index 43ac4a64f49b..2c5c907b1517 100644 --- a/arch/s390/include/asm/ap.h +++ b/arch/s390/include/asm/ap.h @@ -223,13 +223,18 @@ static inline struct ap_queue_status ap_zapq(ap_qid_t qid, int fbit) * config info as returned by the ap_qci() function. */ struct ap_config_info { - unsigned int apsc : 1; /* S bit */ - unsigned int apxa : 1; /* N bit */ - unsigned int qact : 1; /* C bit */ - unsigned int rc8a : 1; /* R bit */ - unsigned int : 4; - unsigned int apsb : 1; /* B bit */ - unsigned int : 23; + union { + unsigned int flags; + struct { + unsigned int apsc : 1; /* S bit */ + unsigned int apxa : 1; /* N bit */ + unsigned int qact : 1; /* C bit */ + unsigned int rc8a : 1; /* R bit */ + unsigned int : 4; + unsigned int apsb : 1; /* B bit */ + unsigned int : 23; + }; + }; unsigned char na; /* max # of APs - 1 */ unsigned char nd; /* max # of Domains - 1 */ unsigned char _reserved0[10]; diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index cce0bafd4c92..d8f5882a8eaf 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -90,8 +90,9 @@ static atomic64_t ap_bindings_complete_count = ATOMIC64_INIT(0); /* completion for APQN bindings complete */ static DECLARE_COMPLETION(ap_apqn_bindings_complete); -static struct ap_config_info *ap_qci_info; -static struct ap_config_info *ap_qci_info_old; +static struct ap_config_info qci[2]; +static struct ap_config_info *const ap_qci_info = &qci[0]; +static struct ap_config_info *const ap_qci_info_old = &qci[1]; /* * AP bus related debug feature things. */ @@ -203,9 +204,7 @@ static int ap_apft_available(void) */ static inline int ap_qact_available(void) { - if (ap_qci_info) - return ap_qci_info->qact; - return 0; + return ap_qci_info->qact; } /* @@ -215,9 +214,7 @@ static inline int ap_qact_available(void) */ int ap_sb_available(void) { - if (ap_qci_info) - return ap_qci_info->apsb; - return 0; + return ap_qci_info->apsb; } /* @@ -229,23 +226,6 @@ bool ap_is_se_guest(void) } EXPORT_SYMBOL(ap_is_se_guest); -/* - * ap_fetch_qci_info(): Fetch cryptographic config info - * - * Returns the ap configuration info fetched via PQAP(QCI). - * On success 0 is returned, on failure a negative errno - * is returned, e.g. if the PQAP(QCI) instruction is not - * available, the return value will be -EOPNOTSUPP. - */ -static inline int ap_fetch_qci_info(struct ap_config_info *info) -{ - if (!ap_qci_available()) - return -EOPNOTSUPP; - if (!info) - return -EINVAL; - return ap_qci(info); -} - /** * ap_init_qci_info(): Allocate and query qci config info.
* Does also update the static variables ap_max_domain_id @@ -253,27 +233,12 @@ static inline int ap_fetch_qci_info(struct ap_config_info *info) */ static void __init ap_init_qci_info(void) { - if (!ap_qci_available()) { + if (!ap_qci_available() || + ap_qci(ap_qci_info)) { AP_DBF_INFO("%s QCI not supported\n", __func__); return; } - - ap_qci_info = kzalloc(sizeof(*ap_qci_info), GFP_KERNEL); - if (!ap_qci_info) - return; - ap_qci_info_old = kzalloc(sizeof(*ap_qci_info_old), GFP_KERNEL); - if (!ap_qci_info_old) { - kfree(ap_qci_info); - ap_qci_info = NULL; - return; - } - if (ap_fetch_qci_info(ap_qci_info) != 0) { - kfree(ap_qci_info); - kfree(ap_qci_info_old); - ap_qci_info = NULL; - ap_qci_info_old = NULL; - return; - } + memcpy(ap_qci_info_old, ap_qci_info, sizeof(*ap_qci_info)); AP_DBF_INFO("%s successful fetched initial qci info\n", __func__); if (ap_qci_info->apxa) { @@ -288,8 +253,6 @@ static void __init ap_init_qci_info(void) __func__, ap_max_domain_id); } } - - memcpy(ap_qci_info_old, ap_qci_info, sizeof(*ap_qci_info)); } /* @@ -312,7 +275,7 @@ static inline int ap_test_config_card_id(unsigned int id) { if (id > ap_max_adapter_id) return 0; - if (ap_qci_info) + if (ap_qci_info->flags) return ap_test_config(ap_qci_info->apm, id); return 1; } @@ -329,7 +292,7 @@ int ap_test_config_usage_domain(unsigned int domain) { if (domain > ap_max_domain_id) return 0; - if (ap_qci_info) + if (ap_qci_info->flags) return ap_test_config(ap_qci_info->aqm, domain); return 1; } @@ -1234,7 +1197,7 @@ static BUS_ATTR_RW(ap_domain); static ssize_t ap_control_domain_mask_show(const struct bus_type *bus, char *buf) { - if (!ap_qci_info) /* QCI not supported */ + if (!ap_qci_info->flags) /* QCI not supported */ return sysfs_emit(buf, "not supported\n"); return sysfs_emit(buf, "0x%08x%08x%08x%08x%08x%08x%08x%08x\n", @@ -1248,7 +1211,7 @@ static BUS_ATTR_RO(ap_control_domain_mask); static ssize_t ap_usage_domain_mask_show(const struct bus_type *bus, char *buf) { - if (!ap_qci_info) /* QCI not supported */ + if (!ap_qci_info->flags) /* QCI not supported */ return sysfs_emit(buf, "not supported\n"); return sysfs_emit(buf, "0x%08x%08x%08x%08x%08x%08x%08x%08x\n", @@ -1262,7 +1225,7 @@ static BUS_ATTR_RO(ap_usage_domain_mask); static ssize_t ap_adapter_mask_show(const struct bus_type *bus, char *buf) { - if (!ap_qci_info) /* QCI not supported */ + if (!ap_qci_info->flags) /* QCI not supported */ return sysfs_emit(buf, "not supported\n"); return sysfs_emit(buf, "0x%08x%08x%08x%08x%08x%08x%08x%08x\n", @@ -1595,7 +1558,7 @@ static ssize_t features_show(const struct bus_type *bus, char *buf) { int n = 0; - if (!ap_qci_info) /* QCI not supported */ + if (!ap_qci_info->flags) /* QCI not supported */ return sysfs_emit(buf, "-\n"); if (ap_qci_info->apsc) @@ -2158,11 +2121,11 @@ static inline void ap_scan_adapter(int ap) */ static bool ap_get_configuration(void) { - if (!ap_qci_info) /* QCI not supported */ + if (!ap_qci_info->flags) /* QCI not supported */ return false; memcpy(ap_qci_info_old, ap_qci_info, sizeof(*ap_qci_info)); - ap_fetch_qci_info(ap_qci_info); + ap_qci(ap_qci_info); return memcmp(ap_qci_info, ap_qci_info_old, sizeof(struct ap_config_info)) != 0; @@ -2179,7 +2142,7 @@ static bool ap_config_has_new_aps(void) unsigned long m[BITS_TO_LONGS(AP_DEVICES)]; - if (!ap_qci_info) + if (!ap_qci_info->flags) return false; bitmap_andnot(m, (unsigned long *)ap_qci_info->apm, @@ -2200,7 +2163,7 @@ static bool ap_config_has_new_doms(void) { unsigned long m[BITS_TO_LONGS(AP_DOMAINS)]; - if (!ap_qci_info) + if 
(!ap_qci_info->flags) return false; bitmap_andnot(m, (unsigned long *)ap_qci_info->aqm, @@ -2427,7 +2390,6 @@ static int __init ap_module_init(void) out: if (ap_irq_flag) unregister_adapter_interrupt(&ap_airq); - kfree(ap_qci_info); return rc; } device_initcall(ap_module_init); From 170660ccf8806742052028093fb45872d2be4775 Mon Sep 17 00:00:00 2001 From: Holger Dengler Date: Fri, 8 Mar 2024 17:13:48 +0100 Subject: [PATCH 02/54] s390/ap: rework ap initialization Rework the ap initialization and add missing cleanups to the error path. Errors during the registration of the IRQ handler are now also detected. Suggested-by: Heiko Carstens Signed-off-by: Holger Dengler Reviewed-by: Harald Freudenberger Signed-off-by: Alexander Gordeev --- drivers/s390/crypto/ap_bus.c | 102 ++++++++++++++++++++++++----------- 1 file changed, 72 insertions(+), 30 deletions(-) diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index d8f5882a8eaf..aabaf93004bc 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -2273,7 +2273,66 @@ static void ap_scan_bus_wq_callback(struct work_struct *unused) } } -static int __init ap_debug_init(void) +static inline int __init ap_async_init(void) +{ + int rc; + + /* Setup the AP bus rescan timer. */ + timer_setup(&ap_scan_bus_timer, ap_scan_bus_timer_callback, 0); + + /* + * Setup the high resolution poll timer. + * If we are running under z/VM adjust polling to z/VM polling rate. + */ + if (MACHINE_IS_VM) + poll_high_timeout = 1500000; + hrtimer_init(&ap_poll_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + ap_poll_timer.function = ap_poll_timeout; + + queue_work(system_long_wq, &ap_scan_bus_work); + + /* Start the low priority AP bus poll thread. */ + if (!ap_thread_flag) + return 0; + + rc = ap_poll_thread_start(); + if (rc) + goto out; + + return 0; + +out: + cancel_work(&ap_scan_bus_work); + hrtimer_cancel(&ap_poll_timer); + timer_delete(&ap_scan_bus_timer); + return rc; +} + +static inline void ap_irq_exit(void) +{ + if (ap_irq_flag) + unregister_adapter_interrupt(&ap_airq); +} + +static inline int __init ap_irq_init(void) +{ + int rc; + + if (!ap_interrupts_available() || !ap_useirq) + return 0; + + rc = register_adapter_interrupt(&ap_airq); + ap_irq_flag = (rc == 0); + + return rc; +} + +static inline void ap_debug_exit(void) +{ + debug_unregister(ap_dbf_info); +} + +static inline int __init ap_debug_init(void) { ap_dbf_info = debug_register("ap", 2, 1, AP_DBF_MAX_SPRINTF_ARGS * sizeof(long)); @@ -2342,15 +2401,14 @@ static int __init ap_module_init(void) } /* enable interrupts if available */ - if (ap_interrupts_available() && ap_useirq) { - rc = register_adapter_interrupt(&ap_airq); - ap_irq_flag = (rc == 0); - } + rc = ap_irq_init(); + if (rc) + goto out; /* Create /sys/bus/ap. */ rc = bus_register(&ap_bus_type); if (rc) - goto out; + goto out_irq; /* Create /sys/devices/ap. */ ap_root_device = root_device_register("ap"); @@ -2359,37 +2417,21 @@ static int __init ap_module_init(void) goto out_bus; ap_root_device->bus = &ap_bus_type; - /* Setup the AP bus rescan timer. */ - timer_setup(&ap_scan_bus_timer, ap_scan_bus_timer_callback, 0); - - /* - * Setup the high resolution poll timer. - * If we are running under z/VM adjust polling to z/VM polling rate. - */ - if (MACHINE_IS_VM) - poll_high_timeout = 1500000; - hrtimer_init(&ap_poll_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - ap_poll_timer.function = ap_poll_timeout; - - /* Start the low priority AP bus poll thread.
*/ - if (ap_thread_flag) { - rc = ap_poll_thread_start(); - if (rc) - goto out_work; - } - - queue_work(system_long_wq, &ap_scan_bus_work); + /* Setup asynchronous work (timers, workqueue, etc). */ + rc = ap_async_init(); + if (rc) + goto out_device; return 0; -out_work: - hrtimer_cancel(&ap_poll_timer); +out_device: root_device_unregister(ap_root_device); out_bus: bus_unregister(&ap_bus_type); +out_irq: + ap_irq_exit(); out: - if (ap_irq_flag) - unregister_adapter_interrupt(&ap_airq); + ap_debug_exit(); return rc; } device_initcall(ap_module_init); From 3c7a377324cf9f248041c0e7295728eebd6c9b14 Mon Sep 17 00:00:00 2001 From: Holger Dengler Date: Fri, 8 Mar 2024 17:23:25 +0100 Subject: [PATCH 03/54] s390/ap: swap IRQ and bus/device registration The IRQ handler may rely on the bus or the root device. Register the adapter IRQ after setting up the bus and the root device to avoid any race conditions. Signed-off-by: Holger Dengler Reviewed-by: Harald Freudenberger Signed-off-by: Alexander Gordeev --- drivers/s390/crypto/ap_bus.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index aabaf93004bc..cdfaa321b44e 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -2400,15 +2400,10 @@ static int __init ap_module_init(void) ap_domain_index = -1; } - /* enable interrupts if available */ - rc = ap_irq_init(); - if (rc) - goto out; - /* Create /sys/bus/ap. */ rc = bus_register(&ap_bus_type); if (rc) - goto out_irq; + goto out; /* Create /sys/devices/ap. */ ap_root_device = root_device_register("ap"); @@ -2417,19 +2412,24 @@ static int __init ap_module_init(void) goto out_bus; ap_root_device->bus = &ap_bus_type; - /* Setup asynchronous work (timers, workqueue, etc). */ - rc = ap_async_init(); + /* enable interrupts if available */ + rc = ap_irq_init(); if (rc) goto out_device; + /* Setup asynchronous work (timers, workqueue, etc). */ + rc = ap_async_init(); + if (rc) + goto out_irq; + return 0; +out_irq: + ap_irq_exit(); out_device: root_device_unregister(ap_root_device); out_bus: bus_unregister(&ap_bus_type); -out_irq: - ap_irq_exit(); out: ap_debug_exit(); return rc; From 05272aa499c48f230d862577d423de3e2b268e47 Mon Sep 17 00:00:00 2001 From: Holger Dengler Date: Thu, 15 Feb 2024 08:59:45 +0100 Subject: [PATCH 04/54] s390/uv: export prot_virt_guest symbol in uv The inline function is_prot_virt_guest() in asm/uv.h makes use of the prot_virt_guest symbol. As this inline function can be called by other parts of the kernel (modules and built-in), the symbol should be exported, similar to the prot_virt_host symbol. One consumer of is_prot_virt_guest() will be the ap bus code. 
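For context, the pattern at issue: a static inline helper in a header is compiled into every translation unit that calls it, so the variable it reads must be export-visible once modules become callers. A minimal sketch of the arrangement (the helper and variable names come from the patch; the CONFIG_PROTECTED_VIRTUALIZATION_GUEST and __bootdata_preserved plumbing is elided):

    /* arch/s390/include/asm/uv.h (sketch) */
    extern int prot_virt_guest;

    static inline int is_prot_virt_guest(void)
    {
    	return prot_virt_guest;
    }

    /* arch/s390/kernel/uv.c (sketch) */
    int prot_virt_guest;
    EXPORT_SYMBOL(prot_virt_guest);	/* lets modules that inline the helper link */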
Cc: Janosch Frank Cc: Claudio Imbrenda Signed-off-by: Holger Dengler Reviewed-by: Harald Freudenberger Acked-by: Claudio Imbrenda Signed-off-by: Alexander Gordeev --- arch/s390/kernel/uv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index fc07bc39e698..2961db043fa4 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -21,6 +21,7 @@ /* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */ #ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST int __bootdata_preserved(prot_virt_guest); +EXPORT_SYMBOL(prot_virt_guest); #endif /* From 2a483d333fd84acc009c7199fdd962af3ffc6b3b Mon Sep 17 00:00:00 2001 From: Holger Dengler Date: Mon, 19 Feb 2024 17:32:54 +0100 Subject: [PATCH 05/54] s390/chsc: use notifier for AP configuration changes The direct dependency of chsc and the AP bus prevents the modularization of ap bus. Introduce a notifier interface for AP changes, which decouples the producer of the change events (chsc) from the consumer (ap_bus). Remove the ap_cfg_chg() interface and replace it with the notifier invocation. The ap bus module registers a notification handler, which triggers the AP bus scan. Cc: Vineeth Vijayan Cc: Peter Oberparleiter Signed-off-by: Holger Dengler Reviewed-by: Harald Freudenberger Acked-by: Vineeth Vijayan Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/ap.h | 11 ----------- arch/s390/include/asm/chsc.h | 15 +++++++++++++++ drivers/s390/cio/chsc.c | 18 ++++++++++++++++-- drivers/s390/crypto/ap_bus.c | 21 +++++++++++++++++++-- 4 files changed, 50 insertions(+), 15 deletions(-) diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h index 2c5c907b1517..395b02d6a133 100644 --- a/arch/s390/include/asm/ap.h +++ b/arch/s390/include/asm/ap.h @@ -549,15 +549,4 @@ static inline struct ap_queue_status ap_dqap(ap_qid_t qid, return reg1.status; } -/* - * Interface to tell the AP bus code that a configuration - * change has happened. The bus code should at least do - * an ap bus resource rescan. 
- */ -#if IS_ENABLED(CONFIG_ZCRYPT) -void ap_bus_cfg_chg(void); -#else -static inline void ap_bus_cfg_chg(void){} -#endif - #endif /* _ASM_S390_AP_H_ */ diff --git a/arch/s390/include/asm/chsc.h b/arch/s390/include/asm/chsc.h index bb48ea380c0d..bb78159d8042 100644 --- a/arch/s390/include/asm/chsc.h +++ b/arch/s390/include/asm/chsc.h @@ -11,6 +11,9 @@ #include +/* struct from linux/notifier.h */ +struct notifier_block; + /** * Operation codes for CHSC PNSO: * PNSO_OC_NET_BRIDGE_INFO - only addresses that are visible to a bridgeport @@ -66,4 +69,16 @@ struct chsc_pnso_area { struct chsc_pnso_naid_l2 entries[]; } __packed __aligned(PAGE_SIZE); +/* + * notifier interface - registered notifiers gets called on + * the following events: + * - ap config changed (CHSC_NOTIFY_AP_CFG) + */ +enum chsc_notify_type { + CHSC_NOTIFY_AP_CFG = 3, +}; + +int chsc_notifier_register(struct notifier_block *nb); +int chsc_notifier_unregister(struct notifier_block *nb); + #endif /* _ASM_S390_CHSC_H */ diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index 44ea76f9e1de..89e51f197cdd 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -24,7 +24,6 @@ #include #include #include -#include #include "css.h" #include "cio.h" @@ -40,6 +39,20 @@ static DEFINE_SPINLOCK(chsc_page_lock); #define SEI_VF_FLA 0xc0 /* VF flag for Full Link Address */ #define SEI_RS_CHPID 0x4 /* 4 in RS field indicates CHPID */ +static BLOCKING_NOTIFIER_HEAD(chsc_notifiers); + +int chsc_notifier_register(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&chsc_notifiers, nb); +} +EXPORT_SYMBOL(chsc_notifier_register); + +int chsc_notifier_unregister(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&chsc_notifiers, nb); +} +EXPORT_SYMBOL(chsc_notifier_unregister); + /** * chsc_error_from_response() - convert a chsc response to an error * @response: chsc response code @@ -581,7 +594,8 @@ static void chsc_process_sei_ap_cfg_chg(struct chsc_sei_nt0_area *sei_area) if (sei_area->rs != 5) return; - ap_bus_cfg_chg(); + blocking_notifier_call_chain(&chsc_notifiers, + CHSC_NOTIFY_AP_CFG, NULL); } static void chsc_process_sei_fces_event(struct chsc_sei_nt0_area *sei_area) diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index cdfaa321b44e..9284ae63074d 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "ap_bus.h" #include "ap_debug.h" @@ -1024,13 +1025,23 @@ EXPORT_SYMBOL(ap_bus_force_rescan); /* * A config change has happened, force an ap bus rescan. */ -void ap_bus_cfg_chg(void) +static int ap_bus_cfg_chg(struct notifier_block *nb, + unsigned long action, void *data) { + if (action != CHSC_NOTIFY_AP_CFG) + return NOTIFY_DONE; + pr_debug("%s config change, forcing bus rescan\n", __func__); ap_bus_force_rescan(); + + return NOTIFY_OK; } +static struct notifier_block ap_bus_nb = { + .notifier_call = ap_bus_cfg_chg, +}; + /* * hex2bitmap() - parse hex mask string and set bitmap. * Valid strings are "0x012345678" with at least one valid hex number. @@ -2291,16 +2302,22 @@ static inline int __init ap_async_init(void) queue_work(system_long_wq, &ap_scan_bus_work); + rc = chsc_notifier_register(&ap_bus_nb); + if (rc) + goto out; + /* Start the low priority AP bus poll thread. 
*/ if (!ap_thread_flag) return 0; rc = ap_poll_thread_start(); if (rc) - goto out; + goto out_notifier; return 0; +out_notifier: + chsc_notifier_unregister(&ap_bus_nb); out: cancel_work(&ap_scan_bus_work); hrtimer_cancel(&ap_poll_timer); From 123760841a2e5977d4e97f86999b3784df58801d Mon Sep 17 00:00:00 2001 From: Holger Dengler Date: Mon, 19 Feb 2024 18:10:19 +0100 Subject: [PATCH 06/54] s390/ap: modularize ap bus There is no hard requirement to have the ap bus statically in the kernel, so add an option to compile it as module. Cc: Tony Krowiak Cc: Halil Pasic Signed-off-by: Holger Dengler Reviewed-by: Harald Freudenberger Reviewed-by: Anthony Krowiak Signed-off-by: Alexander Gordeev --- Documentation/arch/s390/vfio-ap.rst | 2 +- arch/s390/Kconfig | 15 ++++++++++++++- drivers/crypto/Kconfig | 1 + drivers/s390/char/Makefile | 2 +- drivers/s390/crypto/Makefile | 2 +- drivers/s390/crypto/ap_bus.c | 30 +++++++++++++++++++++++++---- 6 files changed, 44 insertions(+), 8 deletions(-) diff --git a/Documentation/arch/s390/vfio-ap.rst b/Documentation/arch/s390/vfio-ap.rst index 929ee1c1c940..cf8533f2a8b6 100644 --- a/Documentation/arch/s390/vfio-ap.rst +++ b/Documentation/arch/s390/vfio-ap.rst @@ -550,7 +550,7 @@ These are the steps: following Kconfig elements selected: * IOMMU_SUPPORT * S390 - * ZCRYPT + * AP * VFIO * KVM diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8f01ada6845e..db7eb4d77d38 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -724,6 +724,19 @@ config EADM_SCH To compile this driver as a module, choose M here: the module will be called eadm_sch. +config AP + def_tristate y + prompt "Support for Adjunct Processors (ap)" + help + This driver allows usage to Adjunct Processor (AP) devices via + the ap bus, cards and queues. Supported Adjunct Processors are + the CryptoExpress Cards (CEX). + + To compile this driver as a module, choose M here: the + module will be called ap. + + If unsure, say Y (default). 
+ config VFIO_CCW def_tristate n prompt "Support for VFIO-CCW subchannels" @@ -740,7 +753,7 @@ config VFIO_AP prompt "VFIO support for AP devices" depends on KVM depends on VFIO - depends on ZCRYPT + depends on AP select VFIO_MDEV help This driver grants access to Adjunct Processor (AP) devices diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 3d02702456a5..16385658fd35 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -67,6 +67,7 @@ config CRYPTO_DEV_GEODE config ZCRYPT tristate "Support for s390 cryptographic adapters" depends on S390 + depends on AP select HW_RANDOM help Select this option if you want to enable support for diff --git a/drivers/s390/char/Makefile b/drivers/s390/char/Makefile index b0f6b3201636..81d6744e1861 100644 --- a/drivers/s390/char/Makefile +++ b/drivers/s390/char/Makefile @@ -32,7 +32,7 @@ obj-$(CONFIG_SCLP_VT220_TTY) += sclp_vt220.o obj-$(CONFIG_PCI) += sclp_pci.o -obj-$(subst m,y,$(CONFIG_ZCRYPT)) += sclp_ap.o +obj-$(subst m,y,$(CONFIG_AP)) += sclp_ap.o obj-$(CONFIG_VMLOGRDR) += vmlogrdr.o obj-$(CONFIG_VMCP) += vmcp.o diff --git a/drivers/s390/crypto/Makefile b/drivers/s390/crypto/Makefile index 0edacd101c12..bd94811fd9f1 100644 --- a/drivers/s390/crypto/Makefile +++ b/drivers/s390/crypto/Makefile @@ -4,7 +4,7 @@ # ap-objs := ap_bus.o ap_card.o ap_queue.o -obj-$(subst m,y,$(CONFIG_ZCRYPT)) += ap.o +obj-$(CONFIG_AP) += ap.o # zcrypt_api.o and zcrypt_msgtype*.o depend on ap.o zcrypt-objs := zcrypt_api.o zcrypt_card.o zcrypt_queue.o zcrypt-objs += zcrypt_msgtype6.o zcrypt_msgtype50.o diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index 9284ae63074d..09059b3a3a42 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -44,9 +44,10 @@ #include "ap_bus.h" #include "ap_debug.h" -/* - * Module parameters; note though this file itself isn't modular. - */ +MODULE_AUTHOR("IBM Corporation"); +MODULE_DESCRIPTION("Adjunct Processor Bus driver"); +MODULE_LICENSE("GPL"); + int ap_domain_index = -1; /* Adjunct Processor Domain Index */ static DEFINE_SPINLOCK(ap_domain_lock); module_param_named(domain, ap_domain_index, int, 0440); @@ -2284,6 +2285,16 @@ static void ap_scan_bus_wq_callback(struct work_struct *unused) } } +static inline void __exit ap_async_exit(void) +{ + if (ap_thread_flag) + ap_poll_thread_stop(); + chsc_notifier_unregister(&ap_bus_nb); + cancel_work(&ap_scan_bus_work); + hrtimer_cancel(&ap_poll_timer); + timer_delete(&ap_scan_bus_timer); +} + static inline int __init ap_async_init(void) { int rc; @@ -2451,4 +2462,15 @@ static int __init ap_module_init(void) ap_debug_exit(); return rc; } -device_initcall(ap_module_init); + +static void __exit ap_module_exit(void) +{ + ap_async_exit(); + ap_irq_exit(); + root_device_unregister(ap_root_device); + bus_unregister(&ap_bus_type); + ap_debug_exit(); +} + +module_init(ap_module_init); +module_exit(ap_module_exit); From b3840c8bfc27c1e8dc3953d6a27960ae390d5d80 Mon Sep 17 00:00:00 2001 From: Holger Dengler Date: Tue, 27 Feb 2024 16:49:33 +0100 Subject: [PATCH 07/54] s390/ap: rename ap debug configuration option The configuration option ZCRYPT_DEBUG is used only in ap queue code, so rename it to AP_DEBUG. It also no longer depends on ZCRYPT but on AP. While at it, also update the help text. 
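A brief aside on the mechanics, since only the diff shows it: a bool Kconfig symbol AP_DEBUG surfaces in C as the macro CONFIG_AP_DEBUG, which the queue code tests with #ifdef to compile the extra sysfs attributes out entirely. A hedged sketch of the alternative IS_ENABLED() style, which keeps both branches type-checked even when the option is off (illustrative only; the patch keeps the #ifdef form):

    /* Compiled away when CONFIG_AP_DEBUG=n, but still parsed and type-checked. */
    if (IS_ENABLED(CONFIG_AP_DEBUG))
    	pr_debug("ap: debug attributes available\n");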
Signed-off-by: Holger Dengler Reviewed-by: Harald Freudenberger Signed-off-by: Alexander Gordeev --- arch/s390/Kconfig | 14 ++++++++++++++ drivers/crypto/Kconfig | 17 ----------------- drivers/s390/crypto/ap_queue.c | 4 ++-- 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index db7eb4d77d38..b9857aacd40b 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -737,6 +737,20 @@ config AP If unsure, say Y (default). +config AP_DEBUG + def_bool n + prompt "Enable debug features for Adjunct Processor (ap) devices" + depends on AP + help + Say 'Y' here to enable some additional debug features for Adjunct + Processor (ap) devices. + + There will be some more sysfs attributes displayed for ap queues. + + Do not enable on production level kernel build. + + If unsure, say N. + config VFIO_CCW def_tristate n prompt "Support for VFIO-CCW subchannels" diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 16385658fd35..bfe18ebc2a98 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -75,23 +75,6 @@ config ZCRYPT to 8 in Coprocessor (CEXxC), EP11 Coprocessor (CEXxP) or Accelerator (CEXxA) mode. -config ZCRYPT_DEBUG - bool "Enable debug features for s390 cryptographic adapters" - default n - depends on DEBUG_KERNEL - depends on ZCRYPT - help - Say 'Y' here to enable some additional debug features on the - s390 cryptographic adapters driver. - - There will be some more sysfs attributes displayed for ap cards - and queues and some flags on crypto requests are interpreted as - debugging messages to force error injection. - - Do not enable on production level kernel build. - - If unsure, say N. - config PKEY tristate "Kernel API for protected key handling" depends on S390 diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c index 6e4e8d324a6d..1f647ffd6f4d 100644 --- a/drivers/s390/crypto/ap_queue.c +++ b/drivers/s390/crypto/ap_queue.c @@ -708,7 +708,7 @@ static ssize_t ap_functions_show(struct device *dev, static DEVICE_ATTR_RO(ap_functions); -#ifdef CONFIG_ZCRYPT_DEBUG +#ifdef CONFIG_AP_DEBUG static ssize_t states_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -820,7 +820,7 @@ static struct attribute *ap_queue_dev_attrs[] = { &dev_attr_config.attr, &dev_attr_chkstop.attr, &dev_attr_ap_functions.attr, -#ifdef CONFIG_ZCRYPT_DEBUG +#ifdef CONFIG_AP_DEBUG &dev_attr_states.attr, &dev_attr_last_err_rc.attr, #endif From aaebea959efb2cccd870990f1b6016ff324b0fb6 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Thu, 14 Mar 2024 17:52:09 +0800 Subject: [PATCH 08/54] s390/cio: convert sprintf()/snprintf() to sysfs_emit() Per filesystems/sysfs.rst, show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. coccinelle complains that there are still a couple of functions that use snprintf(). Convert them to sysfs_emit(). Generally, this patch is generated by make coccicheck M= MODE=patch \ COCCI=scripts/coccinelle/api/device_attr_show.cocci No functional change intended. 
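For background on why this conversion is more than cosmetic: sysfs_emit() enforces the sysfs contract that a show() callback writes into a page-aligned buffer of at most PAGE_SIZE. Its behavior is roughly the following sketch (simplified from the kernel's fs/sysfs/file.c; treat the details as approximate):

    ssize_t sysfs_emit(char *buf, const char *fmt, ...)
    {
    	va_list args;
    	ssize_t len;

    	/* reject anything that is not the start of the sysfs page */
    	if (WARN(!buf || offset_in_page(buf), "invalid sysfs_emit: buf:%p\n", buf))
    		return 0;

    	va_start(args, fmt);
    	len = vscnprintf(buf, PAGE_SIZE, fmt, args);	/* bounded, unlike sprintf() */
    	va_end(args);

    	return len;
    }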
Cc: Vineeth Vijayan Cc: Peter Oberparleiter Signed-off-by: Li Zhijian Link: https://lore.kernel.org/r/20240314095209.1325229-1-lizhijian@fujitsu.com Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- drivers/s390/cio/css.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 1d68db1a3d4e..781f84901256 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -309,7 +309,7 @@ static ssize_t type_show(struct device *dev, struct device_attribute *attr, { struct subchannel *sch = to_subchannel(dev); - return sprintf(buf, "%01x\n", sch->st); + return sysfs_emit(buf, "%01x\n", sch->st); } static DEVICE_ATTR_RO(type); @@ -319,7 +319,7 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, { struct subchannel *sch = to_subchannel(dev); - return sprintf(buf, "css:t%01X\n", sch->st); + return sysfs_emit(buf, "css:t%01X\n", sch->st); } static DEVICE_ATTR_RO(modalias); @@ -345,7 +345,7 @@ static ssize_t driver_override_show(struct device *dev, ssize_t len; device_lock(dev); - len = snprintf(buf, PAGE_SIZE, "%s\n", sch->driver_override); + len = sysfs_emit(buf, "%s\n", sch->driver_override); device_unlock(dev); return len; } @@ -396,8 +396,8 @@ static ssize_t pimpampom_show(struct device *dev, struct subchannel *sch = to_subchannel(dev); struct pmcw *pmcw = &sch->schib.pmcw; - return sprintf(buf, "%02x %02x %02x\n", - pmcw->pim, pmcw->pam, pmcw->pom); + return sysfs_emit(buf, "%02x %02x %02x\n", + pmcw->pim, pmcw->pam, pmcw->pom); } static DEVICE_ATTR_RO(pimpampom); @@ -881,7 +881,7 @@ static ssize_t real_cssid_show(struct device *dev, struct device_attribute *a, if (!css->id_valid) return -EINVAL; - return sprintf(buf, "%x\n", css->cssid); + return sysfs_emit(buf, "%x\n", css->cssid); } static DEVICE_ATTR_RO(real_cssid); @@ -904,7 +904,7 @@ static ssize_t cm_enable_show(struct device *dev, struct device_attribute *a, int ret; mutex_lock(&css->mutex); - ret = sprintf(buf, "%x\n", css->cm_enabled); + ret = sysfs_emit(buf, "%x\n", css->cm_enabled); mutex_unlock(&css->mutex); return ret; } From 4f00d4ef6634b31d6026b9bf6bb1a90c889e2347 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 26 Mar 2024 11:52:23 +0100 Subject: [PATCH 09/54] s390: adjust indentation of RELOCS command build step out Common pattern in non-verbose build output for quiet commands is that the shorthand of a command including whitespace contains at least eight characters. Adjust this for the RELOCS command, which comes only with seven characters. 
Before: SORTTAB vmlinux CC arch/s390/boot/version.o RELOCS arch/s390/boot/relocs.S OBJCOPY arch/s390/boot/info.bin After: SORTTAB vmlinux CC arch/s390/boot/version.o RELOCS arch/s390/boot/relocs.S OBJCOPY arch/s390/boot/info.bin Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/boot/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index 294f08a8811a..bd5d4d37a961 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -112,7 +112,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE ifndef CONFIG_PIE_BUILD CMD_RELOCS=arch/s390/tools/relocs -quiet_cmd_relocs = RELOCS $@ +quiet_cmd_relocs = RELOCS $@ cmd_relocs = $(CMD_RELOCS) $< > $@ $(obj)/relocs.S: vmlinux FORCE $(call if_changed,relocs) From f10933cbd2dfddf6273698a45f76db9bafd8150f Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Wed, 27 Mar 2024 09:22:43 +0100 Subject: [PATCH 10/54] s390/cpum_cf: make crypto counters upward compatible across machine types The CPU Measurement facility crypto counter set functionality is defined by the Second Counter Version Number. This number varies between machine types, but is upward compatible. Lessen the checks to reflect this behavior. Signed-off-by: Thomas Richter Acked-by: Sumanth Korikkar Signed-off-by: Alexander Gordeev --- arch/s390/kernel/perf_cpum_cf.c | 2 +- arch/s390/kernel/perf_cpum_cf_events.c | 11 +++-------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 41ed6e0f0a2a..1434642e9cba 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -428,7 +428,7 @@ static void cpum_cf_make_setsize(enum cpumf_ctr_set ctrset) case CPUMF_CTR_SET_CRYPTO: if (cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5) ctrset_size = 16; - else if (cpumf_ctr_info.csvn == 6 || cpumf_ctr_info.csvn == 7) + else if (cpumf_ctr_info.csvn >= 6) ctrset_size = 20; break; case CPUMF_CTR_SET_EXT: diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c index 0d64aafd158f..e4a6bfc91080 100644 --- a/arch/s390/kernel/perf_cpum_cf_events.c +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -855,16 +855,11 @@ __init const struct attribute_group **cpumf_cf_event_group(void) } /* Determine version specific crypto set */ - switch (ci.csvn) { - case 1 ... 5: + csvn = none; + if (ci.csvn >= 1 && ci.csvn <= 5) csvn = cpumcf_svn_12345_pmu_event_attr; - break; - case 6 ... 7: + else if (ci.csvn >= 6) csvn = cpumcf_svn_67_pmu_event_attr; - break; - default: - csvn = none; - } /* Determine model-specific counter set(s) */ get_cpu_id(&cpu_id); From 259e660d91d0e7261ae0ee37bb37266d6006a546 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 22 Mar 2024 16:11:46 +0000 Subject: [PATCH 11/54] s390/mm: Convert make_page_secure to use a folio These page APIs are deprecated, so convert the incoming page to a folio and use the folio APIs instead. The ultravisor API cannot handle large folios, so return -EINVAL if one has slipped through. 
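The diff below is largely a mechanical substitution of deprecated page accessors with their folio equivalents; the following correspondence, compiled here for orientation (not part of the patch), summarizes the replacements it makes:

    struct folio *folio = page_folio(page);	/* owning folio of a page */

    /* page API               ->  folio API used by the patch      */
    /* page_mapcount(page)    ->  folio_mapcount(folio)            */
    /* PageSwapCache(page)    ->  folio_test_swapcache(folio)      */
    /* page_mapping(page)     ->  folio_mapping(folio)             */
    /* page_has_private(page) ->  folio->private (non-NULL check)  */
    /* PageWriteback(page)    ->  folio_test_writeback(folio)      */
    /* page_ref_freeze()      ->  folio_ref_freeze()               */
    /* page_ref_unfreeze()    ->  folio_ref_unfreeze()             */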
Acked-by: Claudio Imbrenda Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20240322161149.2327518-2-willy@infradead.org Signed-off-by: Alexander Gordeev --- arch/s390/kernel/uv.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 2961db043fa4..4ae1571572d0 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -182,21 +182,21 @@ int uv_convert_owned_from_secure(unsigned long paddr) } /* - * Calculate the expected ref_count for a page that would otherwise have no + * Calculate the expected ref_count for a folio that would otherwise have no * further pins. This was cribbed from similar functions in other places in * the kernel, but with some slight modifications. We know that a secure - * page can not be a huge page for example. + * folio can not be a large folio, for example. */ -static int expected_page_refs(struct page *page) +static int expected_folio_refs(struct folio *folio) { int res; - res = page_mapcount(page); - if (PageSwapCache(page)) { + res = folio_mapcount(folio); + if (folio_test_swapcache(folio)) { res++; - } else if (page_mapping(page)) { + } else if (folio_mapping(folio)) { res++; - if (page_has_private(page)) + if (folio->private) res++; } return res; @@ -204,14 +204,17 @@ static int expected_page_refs(struct page *page) static int make_page_secure(struct page *page, struct uv_cb_header *uvcb) { + struct folio *folio = page_folio(page); int expected, cc = 0; - if (PageWriteback(page)) + if (folio_test_large(folio)) + return -EINVAL; + if (folio_test_writeback(folio)) return -EAGAIN; - expected = expected_page_refs(page); - if (!page_ref_freeze(page, expected)) + expected = expected_folio_refs(folio); + if (!folio_ref_freeze(folio, expected)) return -EBUSY; - set_bit(PG_arch_1, &page->flags); + set_bit(PG_arch_1, &folio->flags); /* * If the UVC does not succeed or fail immediately, we don't want to * loop for long, or we might get stall notifications. @@ -221,9 +224,9 @@ static int make_page_secure(struct page *page, struct uv_cb_header *uvcb) * -EAGAIN and we let the callers deal with it. */ cc = __uv_call(0, (u64)uvcb); - page_ref_unfreeze(page, expected); + folio_ref_unfreeze(folio, expected); /* - * Return -ENXIO if the page was not mapped, -EINVAL for other errors. + * Return -ENXIO if the folio was not mapped, -EINVAL for other errors. * If busy or partially completed, return -EAGAIN. */ if (cc == UVC_CC_OK) From d35c34bb32f2cc4ec0b52e91ad7a8fcab55d7856 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 22 Mar 2024 16:11:47 +0000 Subject: [PATCH 12/54] s390/mm: Convert gmap_make_secure to use a folio Remove uses of deprecated page APIs, and move the check for large folios to here to avoid taking the folio lock if the folio is too large. We could do better here by attempting to split the large folio, but I'll leave that improvement for someone who can test it. 
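To make the deferred improvement concrete, one untested sketch of what a split attempt could look like, using the generic split_folio() helper (which requires the folio to be locked and returns 0 on success); the retry-on-success flow is this editor's assumption, exactly the kind of thing the author says needs testing:

    if (folio_test_large(folio)) {
    	rc = -EAGAIN;
    	if (folio_trylock(folio)) {
    		if (split_folio(folio))
    			rc = -EINVAL;	/* unsplittable: keep today's behavior */
    		folio_unlock(folio);
    	}
    	goto unlock;	/* on success, -EAGAIN retries with the small folios */
    }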
Acked-by: Claudio Imbrenda Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20240322161149.2327518-3-willy@infradead.org Signed-off-by: Alexander Gordeev --- arch/s390/kernel/uv.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 4ae1571572d0..265fea37e030 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -202,13 +202,10 @@ static int expected_folio_refs(struct folio *folio) return res; } -static int make_page_secure(struct page *page, struct uv_cb_header *uvcb) +static int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) { - struct folio *folio = page_folio(page); int expected, cc = 0; - if (folio_test_large(folio)) - return -EINVAL; if (folio_test_writeback(folio)) return -EAGAIN; expected = expected_folio_refs(folio); @@ -281,7 +278,7 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) bool local_drain = false; spinlock_t *ptelock; unsigned long uaddr; - struct page *page; + struct folio *folio; pte_t *ptep; int rc; @@ -310,15 +307,19 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) if (!ptep) goto out; if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) { - page = pte_page(*ptep); + folio = page_folio(pte_page(*ptep)); + rc = -EINVAL; + if (folio_test_large(folio)) + goto unlock; rc = -EAGAIN; - if (trylock_page(page)) { + if (folio_trylock(folio)) { if (should_export_before_import(uvcb, gmap->mm)) - uv_convert_from_secure(page_to_phys(page)); - rc = make_page_secure(page, uvcb); - unlock_page(page); + uv_convert_from_secure(PFN_PHYS(folio_pfn(folio))); + rc = make_folio_secure(folio, uvcb); + folio_unlock(folio); } } +unlock: pte_unmap_unlock(ptep, ptelock); out: mmap_read_unlock(gmap->mm); @@ -328,10 +329,10 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) * If we are here because the UVC returned busy or partial * completion, this is just a useless check, but it is safe. */ - wait_on_page_writeback(page); + folio_wait_writeback(folio); } else if (rc == -EBUSY) { /* - * If we have tried a local drain and the page refcount + * If we have tried a local drain and the folio refcount * still does not match our expected safe value, try with a * system wide drain. This is needed if the pagevecs holding * the page are on a different CPU. @@ -342,7 +343,7 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) return -EAGAIN; } /* - * We are here if the page refcount does not match the + * We are here if the folio refcount does not match the * expected safe value. The main culprits are usually * pagevecs. With lru_add_drain() we drain the pagevecs * on the local CPU so that hopefully the refcount will From a817d98dc2e8f9c252bd3e31d86dfe61ec76d23f Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Tue, 26 Mar 2024 17:03:20 +0100 Subject: [PATCH 13/54] s390/cio: rework channel-utilization-block handling Convert channel-utilization-block (CUB) address variables from separate named fields to arrays of addresses. Also simplify error handling and introduce named constants. This is done in preparation of introducing additional CUBs. Note: With this change the __packed annotation of secm_area is required to prevent an alignment hole that would otherwise occur due to the switch from u32 to dma64_t. 
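The alignment note can be verified in isolation: on an ABI where 8-byte integers are naturally aligned (as on s390x), an 8-byte member following a u32 is pushed to the next 8-byte boundary, and __packed suppresses exactly that hole. A small self-contained illustration (generic C, not the actual secm_area layout):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    struct natural { uint32_t a; uint64_t b; };	/* compiler pads 4 bytes before b */
    struct packed_ { uint32_t a; uint64_t b; } __attribute__((packed));

    static_assert(offsetof(struct natural, b) == 8, "hole before b");
    static_assert(offsetof(struct packed_, b) == 4, "no hole");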
Reviewed-by: Vineeth Vijayan Acked-by: Heiko Carstens Signed-off-by: Peter Oberparleiter Signed-off-by: Alexander Gordeev --- drivers/s390/cio/chp.c | 8 +++--- drivers/s390/cio/chsc.c | 56 +++++++++++++++++++++++++++-------------- drivers/s390/cio/css.h | 10 ++++++-- 3 files changed, 49 insertions(+), 25 deletions(-) diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c index 675d7ed82356..0edb6dd0f105 100644 --- a/drivers/s390/cio/chp.c +++ b/drivers/s390/cio/chp.c @@ -161,12 +161,12 @@ static void chp_measurement_copy_block(struct cmg_entry *buf, struct cmg_entry *entry, reference_buf; int idx; - if (chpid.id < 128) { - area = css->cub_addr1; + if (chpid.id < CSS_CUES_PER_PAGE) { + area = css->cub[0]; idx = chpid.id; } else { - area = css->cub_addr2; - idx = chpid.id - 128; + area = css->cub[1]; + idx = chpid.id - CSS_CUES_PER_PAGE; } entry = area + (idx * sizeof(struct cmg_entry)); do { diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index 89e51f197cdd..3344fa996ec4 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -874,19 +874,16 @@ int __chsc_do_secm(struct channel_subsystem *css, int enable) u32 : 30; u32 key : 4; u32 : 28; - u32 zeroes1; - dma32_t cub_addr1; - u32 zeroes2; - dma32_t cub_addr2; + dma64_t cub[CSS_NUM_CUB_PAGES]; u32 reserved[13]; struct chsc_header response; u32 status : 8; u32 : 4; u32 fmt : 4; u32 : 16; - } *secm_area; + } __packed *secm_area; unsigned long flags; - int ret, ccode; + int ret, ccode, i; spin_lock_irqsave(&chsc_page_lock, flags); memset(chsc_page, 0, PAGE_SIZE); @@ -895,8 +892,9 @@ int __chsc_do_secm(struct channel_subsystem *css, int enable) secm_area->request.code = 0x0016; secm_area->key = PAGE_DEFAULT_KEY >> 4; - secm_area->cub_addr1 = virt_to_dma32(css->cub_addr1); - secm_area->cub_addr2 = virt_to_dma32(css->cub_addr2); + + for (i = 0; i < CSS_NUM_CUB_PAGES; i++) + secm_area->cub[i] = (__force dma64_t)virt_to_dma32(css->cub[i]); secm_area->operation_code = enable ? 
0 : 1; @@ -922,19 +920,38 @@ int __chsc_do_secm(struct channel_subsystem *css, int enable) return ret; } +static int cub_alloc(struct channel_subsystem *css) +{ + int i; + + for (i = 0; i < CSS_NUM_CUB_PAGES; i++) { + css->cub[i] = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA); + if (!css->cub[i]) + return -ENOMEM; + } + + return 0; +} + +static void cub_free(struct channel_subsystem *css) +{ + int i; + + for (i = 0; i < CSS_NUM_CUB_PAGES; i++) { + free_page((unsigned long)css->cub[i]); + css->cub[i] = NULL; + } +} + int chsc_secm(struct channel_subsystem *css, int enable) { int ret; if (enable && !css->cm_enabled) { - css->cub_addr1 = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA); - css->cub_addr2 = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA); - if (!css->cub_addr1 || !css->cub_addr2) { - free_page((unsigned long)css->cub_addr1); - free_page((unsigned long)css->cub_addr2); - return -ENOMEM; - } + ret = cub_alloc(css); + if (ret) + goto out; } ret = __chsc_do_secm(css, enable); if (!ret) { @@ -948,10 +965,11 @@ chsc_secm(struct channel_subsystem *css, int enable) } else chsc_remove_cmg_attr(css); } - if (!css->cm_enabled) { - free_page((unsigned long)css->cub_addr1); - free_page((unsigned long)css->cub_addr2); - } + +out: + if (!css->cm_enabled) + cub_free(css); + return ret; } diff --git a/drivers/s390/cio/css.h b/drivers/s390/cio/css.h index ea5550554297..aca11dacb224 100644 --- a/drivers/s390/cio/css.h +++ b/drivers/s390/cio/css.h @@ -34,6 +34,13 @@ #define SNID_STATE3_MULTI_PATH 1 #define SNID_STATE3_SINGLE_PATH 0 +/* + * Miscellaneous constants + */ + +#define CSS_NUM_CUB_PAGES 2 +#define CSS_CUES_PER_PAGE 128 + /* * Conditions used to specify which subchannels need evaluation */ @@ -122,8 +129,7 @@ struct channel_subsystem { struct mutex mutex; /* channel measurement related */ int cm_enabled; - void *cub_addr1; - void *cub_addr2; + void *cub[CSS_NUM_CUB_PAGES]; /* for orphaned ccw devices */ struct subchannel *pseudo_subchannel; }; From b4691baaeef07bdc4c3524a0702d7dd6899610b4 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Tue, 26 Mar 2024 17:03:21 +0100 Subject: [PATCH 14/54] s390/cio: simplify measurement attribute registration Use attribute groups to simplify registration, removal and extension of measurement related sysfs attributes. 
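For readers unfamiliar with the helpers appearing in the diff below: BIN_ATTR_ADMIN_RO(name, size) declares a root-readable struct bin_attribute named bin_attr_<name>, and BIN_ATTRIBUTE_GROUPS(measurement) expands, roughly, to the boilerplate consumed by device_add_groups()/device_remove_groups() (a sketch of the include/linux/sysfs.h expansion, not literal kernel source):

    static const struct attribute_group measurement_group = {
    	.bin_attrs = measurement_attrs,	/* the NULL-terminated array from the patch */
    };
    static const struct attribute_group *measurement_groups[] = {
    	&measurement_group,
    	NULL,
    };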
Reviewed-by: Vineeth Vijayan Acked-by: Heiko Carstens Signed-off-by: Peter Oberparleiter Signed-off-by: Alexander Gordeev --- drivers/s390/cio/chp.c | 49 +++++++++++++----------------------------- 1 file changed, 15 insertions(+), 34 deletions(-) diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c index 0edb6dd0f105..e4a9ce5cacbb 100644 --- a/drivers/s390/cio/chp.c +++ b/drivers/s390/cio/chp.c @@ -127,10 +127,9 @@ static int s390_vary_chpid(struct chp_id chpid, int on) /* * Channel measurement related functions */ -static ssize_t chp_measurement_chars_read(struct file *filp, - struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t off, size_t count) +static ssize_t measurement_chars_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t count) { struct channel_path *chp; struct device *device; @@ -143,15 +142,7 @@ static ssize_t chp_measurement_chars_read(struct file *filp, return memory_read_from_buffer(buf, count, &off, &chp->cmg_chars, sizeof(chp->cmg_chars)); } - -static const struct bin_attribute chp_measurement_chars_attr = { - .attr = { - .name = "measurement_chars", - .mode = S_IRUSR, - }, - .size = sizeof(struct cmg_chars), - .read = chp_measurement_chars_read, -}; +static BIN_ATTR_ADMIN_RO(measurement_chars, sizeof(struct cmg_chars)); static void chp_measurement_copy_block(struct cmg_entry *buf, struct channel_subsystem *css, @@ -175,9 +166,9 @@ static void chp_measurement_copy_block(struct cmg_entry *buf, } while (reference_buf.values[0] != buf->values[0]); } -static ssize_t chp_measurement_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t off, size_t count) +static ssize_t measurement_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t count) { struct channel_path *chp; struct channel_subsystem *css; @@ -197,33 +188,23 @@ static ssize_t chp_measurement_read(struct file *filp, struct kobject *kobj, count = size; return count; } +static BIN_ATTR_ADMIN_RO(measurement, sizeof(struct cmg_entry)); -static const struct bin_attribute chp_measurement_attr = { - .attr = { - .name = "measurement", - .mode = S_IRUSR, - }, - .size = sizeof(struct cmg_entry), - .read = chp_measurement_read, +static struct bin_attribute *measurement_attrs[] = { + &bin_attr_measurement_chars, + &bin_attr_measurement, + NULL, }; +BIN_ATTRIBUTE_GROUPS(measurement); void chp_remove_cmg_attr(struct channel_path *chp) { - device_remove_bin_file(&chp->dev, &chp_measurement_chars_attr); - device_remove_bin_file(&chp->dev, &chp_measurement_attr); + device_remove_groups(&chp->dev, measurement_groups); } int chp_add_cmg_attr(struct channel_path *chp) { - int ret; - - ret = device_create_bin_file(&chp->dev, &chp_measurement_chars_attr); - if (ret) - return ret; - ret = device_create_bin_file(&chp->dev, &chp_measurement_attr); - if (ret) - device_remove_bin_file(&chp->dev, &chp_measurement_chars_attr); - return ret; + return device_add_groups(&chp->dev, measurement_groups); } /* From 2dc8903af775ecd51e42c0a2e7afa0e98d1bde45 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Tue, 26 Mar 2024 17:03:22 +0100 Subject: [PATCH 15/54] s390/cio: export extended channel-path-measurement data Add a per-CHPID binary sysfs attribute named "ext_measurement" that provides access to extended channel-path-measurement data for the associated channel path. 
Note that while not all channel-paths provide extended measurement data this attribute is created unconditionally for all channel paths because channel-path measurement capabilities might change during run-time. Reading from the attribute will only return data for channel-paths that support extended measurement data. Example: $ echo 1 > /sys/devices/css0/cm_enable $ xxd /sys/devices/css0/chp0.32/ext_measurement 00000000: 53e0 8002 0000 0095 0000 0000 59cc e034 S...........Y..4 00000010: 38b8 cc45 0000 0000 0000 0000 3e24 fe94 8..E........>$.. 00000020: 0000 0000 0000 0000 0000 0000 0000 0000 ................ 00000030: 0000 0000 0000 0000 0000 0000 0000 0000 ................ Reviewed-by: Vineeth Vijayan Tested-by: Vineeth Vijayan Acked-by: Heiko Carstens Signed-off-by: Peter Oberparleiter Signed-off-by: Alexander Gordeev --- drivers/s390/cio/chp.c | 76 +++++++++++++++++++++++------------------ drivers/s390/cio/chp.h | 1 + drivers/s390/cio/chsc.c | 24 +++++++++++-- drivers/s390/cio/chsc.h | 5 +++ drivers/s390/cio/css.h | 3 ++ 5 files changed, 73 insertions(+), 36 deletions(-) diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c index e4a9ce5cacbb..fd7d34ed6ea9 100644 --- a/drivers/s390/cio/chp.c +++ b/drivers/s390/cio/chp.c @@ -144,55 +144,65 @@ static ssize_t measurement_chars_read(struct file *filp, struct kobject *kobj, } static BIN_ATTR_ADMIN_RO(measurement_chars, sizeof(struct cmg_chars)); -static void chp_measurement_copy_block(struct cmg_entry *buf, - struct channel_subsystem *css, - struct chp_id chpid) +static ssize_t chp_measurement_copy_block(void *buf, loff_t off, size_t count, + struct kobject *kobj, bool extended) { - void *area; - struct cmg_entry *entry, reference_buf; - int idx; + struct channel_path *chp; + struct channel_subsystem *css; + struct device *device; + unsigned int size; + void *area, *entry; + int id, idx; - if (chpid.id < CSS_CUES_PER_PAGE) { - area = css->cub[0]; - idx = chpid.id; + device = kobj_to_dev(kobj); + chp = to_channelpath(device); + css = to_css(chp->dev.parent); + id = chp->chpid.id; + + if (extended) { + /* Check if extended measurement data is available. */ + if (!chp->extended) + return 0; + + size = sizeof(struct cmg_ext_entry); + area = css->ecub[id / CSS_ECUES_PER_PAGE]; + idx = id % CSS_ECUES_PER_PAGE; } else { - area = css->cub[1]; - idx = chpid.id - CSS_CUES_PER_PAGE; + size = sizeof(struct cmg_entry); + area = css->cub[id / CSS_CUES_PER_PAGE]; + idx = id % CSS_CUES_PER_PAGE; } - entry = area + (idx * sizeof(struct cmg_entry)); - do { - memcpy(buf, entry, sizeof(*entry)); - memcpy(&reference_buf, entry, sizeof(*entry)); - } while (reference_buf.values[0] != buf->values[0]); + entry = area + (idx * size); + + /* Only allow single reads. */ + if (off || count < size) + return 0; + + memcpy(buf, entry, size); + + return size; } static ssize_t measurement_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { - struct channel_path *chp; - struct channel_subsystem *css; - struct device *device; - unsigned int size; - - device = kobj_to_dev(kobj); - chp = to_channelpath(device); - css = to_css(chp->dev.parent); - - size = sizeof(struct cmg_entry); - - /* Only allow single reads. 
*/ - if (off || count < size) - return 0; - chp_measurement_copy_block((struct cmg_entry *)buf, css, chp->chpid); - count = size; - return count; + return chp_measurement_copy_block(buf, off, count, kobj, false); } static BIN_ATTR_ADMIN_RO(measurement, sizeof(struct cmg_entry)); +static ssize_t ext_measurement_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t count) +{ + return chp_measurement_copy_block(buf, off, count, kobj, true); +} +static BIN_ATTR_ADMIN_RO(ext_measurement, sizeof(struct cmg_ext_entry)); + static struct bin_attribute *measurement_attrs[] = { &bin_attr_measurement_chars, &bin_attr_measurement, + &bin_attr_ext_measurement, NULL, }; BIN_ATTRIBUTE_GROUPS(measurement); diff --git a/drivers/s390/cio/chp.h b/drivers/s390/cio/chp.h index 7ee9eba0abcb..1241033ccd62 100644 --- a/drivers/s390/cio/chp.h +++ b/drivers/s390/cio/chp.h @@ -51,6 +51,7 @@ struct channel_path { /* Channel-measurement related stuff: */ int cmg; int shared; + int extended; struct cmg_chars cmg_chars; }; diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index 3344fa996ec4..f2f4c7e19048 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -871,11 +871,14 @@ int __chsc_do_secm(struct channel_subsystem *css, int enable) struct { struct chsc_header request; u32 operation_code : 2; - u32 : 30; + u32 : 1; + u32 e : 1; + u32 : 28; u32 key : 4; u32 : 28; dma64_t cub[CSS_NUM_CUB_PAGES]; - u32 reserved[13]; + dma64_t ecub[CSS_NUM_ECUB_PAGES]; + u32 reserved[5]; struct chsc_header response; u32 status : 8; u32 : 4; @@ -892,9 +895,12 @@ int __chsc_do_secm(struct channel_subsystem *css, int enable) secm_area->request.code = 0x0016; secm_area->key = PAGE_DEFAULT_KEY >> 4; + secm_area->e = 1; for (i = 0; i < CSS_NUM_CUB_PAGES; i++) secm_area->cub[i] = (__force dma64_t)virt_to_dma32(css->cub[i]); + for (i = 0; i < CSS_NUM_ECUB_PAGES; i++) + secm_area->ecub[i] = virt_to_dma64(css->ecub[i]); secm_area->operation_code = enable ? 0 : 1; @@ -929,6 +935,11 @@ static int cub_alloc(struct channel_subsystem *css) if (!css->cub[i]) return -ENOMEM; } + for (i = 0; i < CSS_NUM_ECUB_PAGES; i++) { + css->ecub[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!css->ecub[i]) + return -ENOMEM; + } return 0; } @@ -941,6 +952,10 @@ static void cub_free(struct channel_subsystem *css) free_page((unsigned long)css->cub[i]); css->cub[i] = NULL; } + for (i = 0; i < CSS_NUM_ECUB_PAGES; i++) { + free_page((unsigned long)css->ecub[i]); + css->ecub[i] = NULL; + } } int @@ -1067,7 +1082,8 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp) u32 zeroes2; u32 not_valid : 1; u32 shared : 1; - u32 : 22; + u32 extended : 1; + u32 : 21; u32 chpid : 8; u32 cmcv : 5; u32 : 11; @@ -1079,6 +1095,7 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp) chp->shared = -1; chp->cmg = -1; + chp->extended = 0; if (!css_chsc_characteristics.scmc || !css_chsc_characteristics.secm) return -EINVAL; @@ -1108,6 +1125,7 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp) chp->cmg = scmc_area->cmg; chp->shared = scmc_area->shared; + chp->extended = scmc_area->extended; if (chp->cmg != 2 && chp->cmg != 3) { /* No cmg-dependent data. 
*/ goto out; diff --git a/drivers/s390/cio/chsc.h b/drivers/s390/cio/chsc.h index 03602295f335..24cd65dbc5a7 100644 --- a/drivers/s390/cio/chsc.h +++ b/drivers/s390/cio/chsc.h @@ -22,6 +22,11 @@ struct cmg_entry { u32 values[NR_MEASUREMENT_ENTRIES]; }; +#define NR_EXT_MEASUREMENT_ENTRIES 16 +struct cmg_ext_entry { + u32 values[NR_EXT_MEASUREMENT_ENTRIES]; +}; + struct channel_path_desc_fmt1 { u8 flags; u8 lsn; diff --git a/drivers/s390/cio/css.h b/drivers/s390/cio/css.h index aca11dacb224..c2b175592bb7 100644 --- a/drivers/s390/cio/css.h +++ b/drivers/s390/cio/css.h @@ -40,6 +40,8 @@ #define CSS_NUM_CUB_PAGES 2 #define CSS_CUES_PER_PAGE 128 +#define CSS_NUM_ECUB_PAGES 4 +#define CSS_ECUES_PER_PAGE 64 /* * Conditions used to specify which subchannels need evaluation @@ -130,6 +132,7 @@ struct channel_subsystem { /* channel measurement related */ int cm_enabled; void *cub[CSS_NUM_CUB_PAGES]; + void *ecub[CSS_NUM_ECUB_PAGES]; /* for orphaned ccw devices */ struct subchannel *pseudo_subchannel; }; From 5e6bb10ee523d85d688aacaa13405d651ad16214 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Tue, 26 Mar 2024 17:03:23 +0100 Subject: [PATCH 16/54] s390/cio: export measurement data for all CMGs A channel-path's channel-measurement-group value (CMG) determines the format of associated measurement data and characteristics blocks. Both blocks are of fixed size and contain a generic and CMG-dependent part. Currently CIO exports these data blocks via sysfs only for a specific list of CMGs even though the kernel itself does not interpret CMG-dependent data. Change CIO to export measurement data and characteristics for all CMGs. This enables supporting new CMG data formats in userspace without the need for kernel changes. Reviewed-by: Vineeth Vijayan Acked-by: Heiko Carstens Signed-off-by: Peter Oberparleiter Signed-off-by: Alexander Gordeev --- drivers/s390/cio/chsc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index f2f4c7e19048..bfc663889fbd 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -1126,10 +1126,6 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp) chp->cmg = scmc_area->cmg; chp->shared = scmc_area->shared; chp->extended = scmc_area->extended; - if (chp->cmg != 2 && chp->cmg != 3) { - /* No cmg-dependent data. */ - goto out; - } chsc_initialize_cmg_chars(chp, scmc_area->cmcv, (struct cmg_chars *) &scmc_area->data); out: From 0f987e6caa3c93f36e277624234880459848fa34 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Tue, 26 Mar 2024 17:03:24 +0100 Subject: [PATCH 17/54] s390/cio: export CHPID operating speed Add a per-CHPID sysfs attribute named "speed_bps" that provides the operating speed of the associated channel path in bits per second, or 0 if the operating speed is not available. 
Example: $ cat /sys/devices/css0/chp0.32/speed_bps 32G Reviewed-by: Vineeth Vijayan Acked-by: Heiko Carstens Signed-off-by: Peter Oberparleiter Signed-off-by: Alexander Gordeev --- drivers/s390/cio/chp.c | 30 ++++++++++++++++++++++++++++++ drivers/s390/cio/chp.h | 1 + drivers/s390/cio/chsc.c | 20 ++++++++++++++++++-- 3 files changed, 49 insertions(+), 2 deletions(-) diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c index fd7d34ed6ea9..a07bbecba61c 100644 --- a/drivers/s390/cio/chp.c +++ b/drivers/s390/cio/chp.c @@ -392,6 +392,35 @@ static ssize_t chp_esc_show(struct device *dev, } static DEVICE_ATTR(esc, 0444, chp_esc_show, NULL); +static char apply_max_suffix(unsigned long *value, unsigned long base) +{ + static char suffixes[] = { 0, 'K', 'M', 'G', 'T' }; + int i; + + for (i = 0; i < ARRAY_SIZE(suffixes) - 1; i++) { + if (*value < base || *value % base != 0) + break; + *value /= base; + } + + return suffixes[i]; +} + +static ssize_t speed_bps_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct channel_path *chp = to_channelpath(dev); + unsigned long speed = chp->speed; + char suffix; + + suffix = apply_max_suffix(&speed, 1000); + + return suffix ? sysfs_emit(buf, "%lu%c\n", speed, suffix) : + sysfs_emit(buf, "%lu\n", speed); +} + +static DEVICE_ATTR_RO(speed_bps); + static ssize_t util_string_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) @@ -423,6 +452,7 @@ static struct attribute *chp_attrs[] = { &dev_attr_chid.attr, &dev_attr_chid_external.attr, &dev_attr_esc.attr, + &dev_attr_speed_bps.attr, NULL, }; static struct attribute_group chp_attr_group = { diff --git a/drivers/s390/cio/chp.h b/drivers/s390/cio/chp.h index 1241033ccd62..a15324a43aa3 100644 --- a/drivers/s390/cio/chp.h +++ b/drivers/s390/cio/chp.h @@ -52,6 +52,7 @@ struct channel_path { int cmg; int shared; int extended; + unsigned long speed; struct cmg_chars cmg_chars; }; diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index bfc663889fbd..dcc1e1c34ca2 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -1066,6 +1066,18 @@ chsc_initialize_cmg_chars(struct channel_path *chp, u8 cmcv, } } +static unsigned long scmc_get_speed(u32 s, u32 p) +{ + unsigned long speed = s; + + if (!p) + p = 8; + while (p--) + speed *= 10; + + return speed; +} + int chsc_get_channel_measurement_chars(struct channel_path *chp) { unsigned long flags; @@ -1086,16 +1098,19 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp) u32 : 21; u32 chpid : 8; u32 cmcv : 5; - u32 : 11; + u32 : 7; + u32 cmgp : 4; u32 cmgq : 8; u32 cmg : 8; - u32 zeroes3; + u32 : 16; + u32 cmgs : 16; u32 data[NR_MEASUREMENT_CHARS]; } *scmc_area; chp->shared = -1; chp->cmg = -1; chp->extended = 0; + chp->speed = 0; if (!css_chsc_characteristics.scmc || !css_chsc_characteristics.secm) return -EINVAL; @@ -1126,6 +1141,7 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp) chp->cmg = scmc_area->cmg; chp->shared = scmc_area->shared; chp->extended = scmc_area->extended; + chp->speed = scmc_get_speed(scmc_area->cmgs, scmc_area->cmgp); chsc_initialize_cmg_chars(chp, scmc_area->cmcv, (struct cmg_chars *) &scmc_area->data); out: From 8692a24d0fae19f674d51726d179ad04ba95d958 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Tue, 26 Mar 2024 17:04:56 +0100 Subject: [PATCH 18/54] s390/cio: fix tracepoint subchannel type field The subchannel-type field "st" of s390_cio_stsch and s390_cio_msch tracepoints is incorrectly filled 
with the subchannel-enabled SCHIB value "ena". Fix this by assigning the correct value. Fixes: d1de8633d96a ("s390 cio: Rewrite trace point class s390_class_schib") Reviewed-by: Heiko Carstens Signed-off-by: Peter Oberparleiter Signed-off-by: Alexander Gordeev --- drivers/s390/cio/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/cio/trace.h b/drivers/s390/cio/trace.h index 86993de25345..a4c5c6736b31 100644 --- a/drivers/s390/cio/trace.h +++ b/drivers/s390/cio/trace.h @@ -50,7 +50,7 @@ DECLARE_EVENT_CLASS(s390_class_schib, __entry->devno = schib->pmcw.dev; __entry->schib = *schib; __entry->pmcw_ena = schib->pmcw.ena; - __entry->pmcw_st = schib->pmcw.ena; + __entry->pmcw_st = schib->pmcw.st; __entry->pmcw_dnv = schib->pmcw.dnv; __entry->pmcw_dev = schib->pmcw.dev; __entry->pmcw_lpm = schib->pmcw.lpm; From 22fdd8ba61187582843f090f100284d9e826adca Mon Sep 17 00:00:00 2001 From: Nina Schoetterl-Glausch Date: Tue, 19 Mar 2024 17:44:20 +0100 Subject: [PATCH 19/54] KVM: s390: vsie: Use virt_to_phys for facility control block In order for SIE to interpretively execute STFLE, it requires the real or absolute address of a facility-list control block. Before writing the location into the shadow SIE control block, convert it from a virtual address. We currently do not run into this bug because the lower 31 bits are the same for virtual and physical addresses. Signed-off-by: Nina Schoetterl-Glausch Link: https://lore.kernel.org/r/20240319164420.4053380-3-nsg@linux.ibm.com Signed-off-by: Janosch Frank Message-Id: <20240319164420.4053380-3-nsg@linux.ibm.com> Signed-off-by: Alexander Gordeev --- arch/s390/kvm/vsie.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index b2c9f010f0fe..d8527a046cf7 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -1005,7 +1006,7 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (read_guest_real(vcpu, fac, &vsie_page->fac, stfle_size() * sizeof(u64))) return set_validity_icpt(scb_s, 0x1090U); - scb_s->fac = (__u32)(__u64) &vsie_page->fac; + scb_s->fac = (u32)virt_to_phys(&vsie_page->fac); } return 0; } From 47bf81767277b5abb87f7e86e15310f0e9d4d06c Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 11 Jul 2023 12:21:39 +0200 Subject: [PATCH 20/54] s390/boot: Do not force vmemmap to start at MAX_PHYSMEM_BITS vmemmap is forcefully set to start at MAX_PHYSMEM_BITS at most. That may have been needed in the past to limit ident_map_size to MAX_PHYSMEM_BITS. However, since commit 75eba6ec0de1 ("s390: unify identity mapping limits handling") ident_map_size is limited in the setup_ident_map_size() function, which is called earlier. Another reason to limit the vmemmap start to MAX_PHYSMEM_BITS was that it was returned by arch_get_mappable_range() as the maximum mappable physical address. Since commit f641679dfe55 ("s390/mm: rework arch_get_mappable_range() callback") that is not required anymore. As a result, there is no necessity to limit the vmemmap starting address to MAX_PHYSMEM_BITS.
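For illustration, the vmemmap placement computation that remains after this change boils down to the following user-space sketch; the region table entry span, the 64-byte struct page size and the layout values are assumptions for the example, not the kernel's actual numbers:

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define RTE_SIZE		(1UL << 42)	/* top level region table entry span, assumed */
#define STRUCT_PAGE_SIZE	64UL		/* stand-in for sizeof(struct page), assumed */
#define round_down(x, a)	((x) & ~((a) - 1))

int main(void)
{
	unsigned long vmalloc_start = 1UL << 52;	/* assumed layout value */
	unsigned long ident_map_size = 1UL << 40;	/* assumed identity mapping size */
	unsigned long pages = ident_map_size / PAGE_SIZE;
	unsigned long vmemmap_start;

	/* keep vmemmap_start aligned to a top level region table entry ... */
	vmemmap_start = round_down(vmalloc_start - pages * STRUCT_PAGE_SIZE, RTE_SIZE);
	/* ... and, after this patch, no longer clamp it to 1UL << MAX_PHYSMEM_BITS */
	printf("vmemmap_start = %#lx\n", vmemmap_start);
	return 0;
}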
Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/boot/startup.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 6cf89314209a..bb5d26f0ff41 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -313,9 +313,8 @@ static unsigned long setup_kernel_memory_layout(void) pages = SECTION_ALIGN_UP(pages); /* keep vmemmap_start aligned to a top level region table entry */ vmemmap_start = round_down(VMALLOC_START - pages * sizeof(struct page), rte_size); - vmemmap_start = min(vmemmap_start, 1UL << MAX_PHYSMEM_BITS); - /* maximum mappable address as seen by arch_get_mappable_range() */ - max_mappable = vmemmap_start; + /* maximum address for which linear mapping could be created (DCSS, memory) */ + max_mappable = min(vmemmap_start, 1UL << MAX_PHYSMEM_BITS); /* make sure identity map doesn't overlay with vmemmap */ ident_map_size = min(ident_map_size, vmemmap_start); vmemmap_size = SECTION_ALIGN_UP(ident_map_size / PAGE_SIZE) * sizeof(struct page); From b2b15f079c4c19e3a73a8b0cf4f901186f12098f Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 1 Mar 2024 07:03:49 +0100 Subject: [PATCH 21/54] s390/boot: Consider DCSS segments on memory layout setup The maximum mappable physical address (as returned by the arch_get_mappable_range() callback) is limited by the value of (1UL << MAX_PHYSMEM_BITS). The maximum physical address available to a DCSS segment is 512GB. In case the available online or offline memory size is less than the DCSS limit, arch_get_mappable_range() would include the never used [512GB..(1UL << MAX_PHYSMEM_BITS)] range. Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/boot/startup.c | 7 +++++-- arch/s390/include/asm/extmem.h | 7 +++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index bb5d26f0ff41..eee742b88cc8 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -313,14 +314,16 @@ static unsigned long setup_kernel_memory_layout(void) pages = SECTION_ALIGN_UP(pages); /* keep vmemmap_start aligned to a top level region table entry */ vmemmap_start = round_down(VMALLOC_START - pages * sizeof(struct page), rte_size); - /* maximum address for which linear mapping could be created (DCSS, memory) */ - max_mappable = min(vmemmap_start, 1UL << MAX_PHYSMEM_BITS); /* make sure identity map doesn't overlay with vmemmap */ ident_map_size = min(ident_map_size, vmemmap_start); vmemmap_size = SECTION_ALIGN_UP(ident_map_size / PAGE_SIZE) * sizeof(struct page); /* make sure vmemmap doesn't overlay with vmalloc area */ VMALLOC_START = max(vmemmap_start + vmemmap_size, VMALLOC_START); vmemmap = (struct page *)vmemmap_start; + /* maximum address for which linear mapping could be created (DCSS, memory) */ + BUILD_BUG_ON(MAX_DCSS_ADDR > (1UL << MAX_PHYSMEM_BITS)); + max_mappable = max(ident_map_size, MAX_DCSS_ADDR); + max_mappable = min(max_mappable, vmemmap_start); return asce_limit; } diff --git a/arch/s390/include/asm/extmem.h b/arch/s390/include/asm/extmem.h index 568fd81bb77b..e0a06060afdd 100644 --- a/arch/s390/include/asm/extmem.h +++ b/arch/s390/include/asm/extmem.h @@ -8,6 +8,13 @@ #define _ASM_S390X_DCSS_H #ifndef __ASSEMBLY__ +/* + * A DCSS segment is defined as a contiguous range of pages using the DEFSEG command.
+ * The range start and end are page numbers with a value less than or equal to + 0x7ffffff (see CP Commands and Utilities Reference). + */ +#define MAX_DCSS_ADDR (512UL * SZ_1G) + /* possible values for segment type as returned by segment_info */ #define SEG_TYPE_SW 0 #define SEG_TYPE_EW 1 From ecf74da64defe9e7f1862d86b4f3d4041e22dc4a Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 11 Jul 2023 07:58:24 +0200 Subject: [PATCH 22/54] s390/boot: Reduce size of identity mapping on overlap In case the vmemmap array would overlap with the vmalloc area on virtual memory layout setup, the size of the vmalloc area is decreased. That could result in less memory than the user requested with the vmalloc= kernel command line parameter. Instead, reduce the size of the identity mapping (and, as a result, the size of the vmemmap array) to avoid such an overlap. Further, the virtual memory allocation currently "rolls" from top to bottom and it is only VMALLOC_START that could get increased due to the overlap. Change that to decrease-only, which makes the whole allocation algorithm easier to comprehend. Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/boot/startup.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index eee742b88cc8..069ecc81332f 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -318,7 +318,10 @@ static unsigned long setup_kernel_memory_layout(void) ident_map_size = min(ident_map_size, vmemmap_start); vmemmap_size = SECTION_ALIGN_UP(ident_map_size / PAGE_SIZE) * sizeof(struct page); /* make sure vmemmap doesn't overlay with vmalloc area */ - VMALLOC_START = max(vmemmap_start + vmemmap_size, VMALLOC_START); + if (vmemmap_start + vmemmap_size > VMALLOC_START) { + vmemmap_size = SECTION_ALIGN_DOWN(ident_map_size / PAGE_SIZE) * sizeof(struct page); + ident_map_size = vmemmap_size / sizeof(struct page) * PAGE_SIZE; + } vmemmap = (struct page *)vmemmap_start; /* maximum address for which linear mapping could be created (DCSS, memory) */ BUILD_BUG_ON(MAX_DCSS_ADDR > (1UL << MAX_PHYSMEM_BITS)); From c8aef260c86ec86c4d6065b6cd67ce7161d1ca10 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 1 Mar 2024 07:05:39 +0100 Subject: [PATCH 23/54] s390/boot: Swap vmalloc and Lowcore/Real Memory Copy areas This is a preparatory rework to allow uncoupling virtual and physical address spaces. Currently the order of virtual memory areas is (the lowcore and .amode31 sections are skipped, as they are irrelevant): identity mapping (the kernel is contained within) vmemmap vmalloc modules Absolute Lowcore Real Memory Copy In the future the kernel will be mapped separately and placed at the end of the virtual address space, so the layout would look like this: identity mapping vmemmap vmalloc modules Absolute Lowcore Real Memory Copy kernel However, the distance between the kernel and the modules needs to be as small as possible, ideally none. Thus, the Absolute Lowcore and Real Memory Copy areas would stay in the way and therefore need to be moved as well: identity mapping vmemmap Absolute Lowcore Real Memory Copy vmalloc modules kernel To facilitate such a layout, swap the vmalloc area with the Absolute Lowcore and Real Memory Copy areas. As a result, the current layout turns into: identity mapping (the kernel is contained within) vmemmap Absolute Lowcore Real Memory Copy vmalloc modules This will allow locating the kernel directly next to the modules once it gets mapped separately.
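The swap can be condensed into the order in which the areas are carved top-down from the address space limit; a simplified, self-contained sketch of that computation (all sizes and the limit below are placeholders for the example, not the in-tree values):

#include <stdio.h>

#define round_down(x, a)	((x) & ~((a) - 1))

int main(void)
{
	/* placeholder sizes, not the real kernel constants */
	unsigned long segment_size = 1UL << 20, page_size = 4096UL;
	unsigned long modules_len = 1UL << 31, vmalloc_size = 1UL << 37;
	unsigned long memcpy_real_size = 1UL << 20, abs_lowcore_map_size = 1UL << 19;
	unsigned long lowcore_size = 1UL << 13;
	unsigned long vmax = 1UL << 53;		/* placeholder address space limit */

	/* new order: modules and vmalloc first, fixed-size areas below them */
	unsigned long modules_end   = round_down(vmax, segment_size);
	unsigned long modules_vaddr = modules_end - modules_len;
	unsigned long vmalloc_end   = modules_vaddr;
	unsigned long vmalloc_start = vmalloc_end - vmalloc_size;
	unsigned long memcpy_real   = round_down(vmalloc_start - memcpy_real_size, page_size);
	unsigned long abs_lowcore   = round_down(memcpy_real - abs_lowcore_map_size, lowcore_size);

	printf("modules: %#lx..%#lx\n", modules_vaddr, modules_end);
	printf("vmalloc: %#lx..%#lx\n", vmalloc_start, vmalloc_end);
	printf("memcpy real: %#lx, abs lowcore: %#lx\n", memcpy_real, abs_lowcore);
	return 0;
}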
Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/boot/startup.c | 20 +++++++++++--------- arch/s390/mm/vmem.c | 3 ++- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 069ecc81332f..ebd8d8dc3bea 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -297,28 +297,30 @@ static unsigned long setup_kernel_memory_layout(void) /* force vmalloc and modules below kasan shadow */ vmax = min(vmax, KASAN_SHADOW_START); #endif - __memcpy_real_area = round_down(vmax - MEMCPY_REAL_SIZE, PAGE_SIZE); - __abs_lowcore = round_down(__memcpy_real_area - ABS_LOWCORE_MAP_SIZE, - sizeof(struct lowcore)); - MODULES_END = round_down(__abs_lowcore, _SEGMENT_SIZE); + MODULES_END = round_down(vmax, _SEGMENT_SIZE); MODULES_VADDR = MODULES_END - MODULES_LEN; VMALLOC_END = MODULES_VADDR; /* allow vmalloc area to occupy up to about 1/2 of the rest virtual space left */ - vsize = round_down(VMALLOC_END / 2, _SEGMENT_SIZE); + vsize = (VMALLOC_END - (MEMCPY_REAL_SIZE + ABS_LOWCORE_MAP_SIZE)) / 2; + vsize = round_down(vsize, _SEGMENT_SIZE); vmalloc_size = min(vmalloc_size, vsize); VMALLOC_START = VMALLOC_END - vmalloc_size; + __memcpy_real_area = round_down(VMALLOC_START - MEMCPY_REAL_SIZE, PAGE_SIZE); + __abs_lowcore = round_down(__memcpy_real_area - ABS_LOWCORE_MAP_SIZE, + sizeof(struct lowcore)); + /* split remaining virtual space between 1:1 mapping & vmemmap array */ - pages = VMALLOC_START / (PAGE_SIZE + sizeof(struct page)); + pages = __abs_lowcore / (PAGE_SIZE + sizeof(struct page)); pages = SECTION_ALIGN_UP(pages); /* keep vmemmap_start aligned to a top level region table entry */ - vmemmap_start = round_down(VMALLOC_START - pages * sizeof(struct page), rte_size); + vmemmap_start = round_down(__abs_lowcore - pages * sizeof(struct page), rte_size); /* make sure identity map doesn't overlay with vmemmap */ ident_map_size = min(ident_map_size, vmemmap_start); vmemmap_size = SECTION_ALIGN_UP(ident_map_size / PAGE_SIZE) * sizeof(struct page); - /* make sure vmemmap doesn't overlay with vmalloc area */ - if (vmemmap_start + vmemmap_size > VMALLOC_START) { + /* make sure vmemmap doesn't overlay with absolute lowcore area */ + if (vmemmap_start + vmemmap_size > __abs_lowcore) { vmemmap_size = SECTION_ALIGN_DOWN(ident_map_size / PAGE_SIZE) * sizeof(struct page); ident_map_size = vmemmap_size / sizeof(struct page) * PAGE_SIZE; } diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 85cddf904cb2..917b8a37383a 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -436,7 +437,7 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end))) return -EINVAL; /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ - if (WARN_ON_ONCE(end > VMALLOC_START)) + if (WARN_ON_ONCE(end > __abs_lowcore)) return -EINVAL; for (addr = start; addr < end; addr = next) { next = pgd_addr_end(addr, end); From bbe72f39022270c40dc1e991b4dadf6f32eed86a Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Sat, 2 Dec 2023 08:50:45 +0100 Subject: [PATCH 24/54] s390/mm: Move KASLR related to <asm/page.h> Move everything KASLR related to <asm/page.h>, similarly to many other architectures.
Acked-by: Heiko Carstens Suggested-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/page.h | 14 ++++++++++++++ arch/s390/include/asm/setup.h | 14 -------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 9381879f7ecf..602e8056b7cc 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -178,6 +178,20 @@ int arch_make_page_accessible(struct page *page); #define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE #endif +extern unsigned long __kaslr_offset; +static inline unsigned long kaslr_offset(void) +{ + return __kaslr_offset; +} + +extern int __kaslr_enabled; +static inline int kaslr_enabled(void) +{ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + return __kaslr_enabled; + return 0; +} + #define __PAGE_OFFSET 0x0UL #define PAGE_OFFSET 0x0UL diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 03bcaa8effb2..32f70873e2b7 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -127,20 +127,6 @@ extern void (*_machine_restart)(char *command); extern void (*_machine_halt)(void); extern void (*_machine_power_off)(void); -extern unsigned long __kaslr_offset; -static inline unsigned long kaslr_offset(void) -{ - return __kaslr_offset; -} - -extern int __kaslr_enabled; -static inline int kaslr_enabled(void) -{ - if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) - return __kaslr_enabled; - return 0; -} - struct oldmem_data { unsigned long start; unsigned long size; From 236f324b747370b97030c9582591f459353e3589 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Sat, 2 Dec 2023 10:57:15 +0100 Subject: [PATCH 25/54] s390/mm: Create virtual memory layout structure This is a preparatory rework to allow uncoupling virtual and physical address spaces. Put the virtual memory layout information into a structure to improve code generation when accessing the structure members, which are currently only ident_map_size and __kaslr_offset.
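A brief illustration of the code generation point (a sketch, not the in-tree code): with both values in one structure, the compiler can reach the members via constant offsets from a single base address instead of materializing a separate literal per global variable:

struct vm_layout {
	unsigned long kaslr_offset;
	unsigned long identity_size;
};

extern struct vm_layout vm_layout;

/* one base address for vm_layout; both loads use constant member offsets
 * (the function name and the sum are purely illustrative) */
static inline unsigned long example_sum(void)
{
	return vm_layout.kaslr_offset + vm_layout.identity_size;
}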
Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/boot/startup.c | 3 +-- arch/s390/include/asm/page.h | 11 ++++++++++- arch/s390/kernel/setup.c | 3 +-- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index ebd8d8dc3bea..8afb232d7c96 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -19,7 +19,7 @@ #include "boot.h" #include "uv.h" -unsigned long __bootdata_preserved(__kaslr_offset); +struct vm_layout __bootdata_preserved(vm_layout); unsigned long __bootdata_preserved(__abs_lowcore); unsigned long __bootdata_preserved(__memcpy_real_area); pte_t *__bootdata_preserved(memcpy_real_ptep); @@ -30,7 +30,6 @@ unsigned long __bootdata_preserved(vmemmap_size); unsigned long __bootdata_preserved(MODULES_VADDR); unsigned long __bootdata_preserved(MODULES_END); unsigned long __bootdata_preserved(max_mappable); -unsigned long __bootdata(ident_map_size); u64 __bootdata_preserved(stfle_fac_list[16]); u64 __bootdata_preserved(alt_stfle_fac_list[16]); diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 602e8056b7cc..f13a4527bf2d 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -178,7 +178,16 @@ int arch_make_page_accessible(struct page *page); #define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE #endif -extern unsigned long __kaslr_offset; +struct vm_layout { + unsigned long kaslr_offset; + unsigned long identity_size; +}; + +extern struct vm_layout vm_layout; + +#define __kaslr_offset vm_layout.kaslr_offset +#define ident_map_size vm_layout.identity_size + static inline unsigned long kaslr_offset(void) { return __kaslr_offset; diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 24ed33f044ec..b9d70869bc36 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -146,10 +146,9 @@ static u32 __amode31_ref *__ctl_linkage_stack = __ctl_linkage_stack_amode31; static u32 __amode31_ref *__ctl_duct = __ctl_duct_amode31; unsigned long __bootdata_preserved(max_mappable); -unsigned long __bootdata(ident_map_size); struct physmem_info __bootdata(physmem_info); -unsigned long __bootdata_preserved(__kaslr_offset); +struct vm_layout __bootdata_preserved(vm_layout); int __bootdata_preserved(__kaslr_enabled); unsigned int __bootdata_preserved(zlib_dfltcc_support); EXPORT_SYMBOL(zlib_dfltcc_support); From 3bb11234b1d17236ba479f7d0eaa9bd12f2f2493 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 20 Feb 2024 14:35:43 +0100 Subject: [PATCH 26/54] s390/boot: Uncouple virtual and physical kernel offsets This is a preparatory rework to allow uncoupling virtual and physical address spaces. Currently __kaslr_offset is the kernel offset in both physical memory on boot and in virtual memory after DAT mode is enabled. Uncouple these offsets and rename the physical address space variant to __kaslr_offset_phys while keeping the name __kaslr_offset for the offset in the virtual address space. Do not use __kaslr_offset_phys after DAT mode is enabled just yet, but still make it a persistent boot variable for later use. Use the __kaslr_offset and __kaslr_offset_phys offsets in their proper contexts and alter the handle_relocs() function to distinguish between the two.
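The semantics of the two offsets can be summarized in a pair of helpers (a sketch of the intent; the equivalent in-tree conversion macros are only introduced later in this series):

/* kernel image addresses translate with both offsets involved */
static inline unsigned long kernel_virt_to_phys(unsigned long kvirt)
{
	return kvirt - __kaslr_offset + __kaslr_offset_phys;
}

static inline unsigned long kernel_phys_to_virt(unsigned long kphys)
{
	return kphys - __kaslr_offset_phys + __kaslr_offset;
}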
Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/Kconfig | 3 +++ arch/s390/boot/pgm_check_info.c | 4 +++- arch/s390/boot/startup.c | 20 ++++++++++++-------- arch/s390/include/asm/page.h | 2 ++ 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b9857aacd40b..9515d4bf1683 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -17,6 +17,9 @@ config ARCH_HAS_ILOG2_U32 config ARCH_HAS_ILOG2_U64 def_bool n +config ARCH_PROC_KCORE_TEXT + def_bool y + config GENERIC_HWEIGHT def_bool y diff --git a/arch/s390/boot/pgm_check_info.c b/arch/s390/boot/pgm_check_info.c index 97244cd7a206..ea96275b0380 100644 --- a/arch/s390/boot/pgm_check_info.c +++ b/arch/s390/boot/pgm_check_info.c @@ -153,8 +153,10 @@ void print_pgm_check_info(void) decompressor_printk("Kernel command line: %s\n", early_command_line); decompressor_printk("Kernel fault: interruption code %04x ilc:%x\n", S390_lowcore.pgm_code, S390_lowcore.pgm_ilc >> 1); - if (kaslr_enabled()) + if (kaslr_enabled()) { decompressor_printk("Kernel random base: %lx\n", __kaslr_offset); + decompressor_printk("Kernel random base phys: %lx\n", __kaslr_offset_phys); + } decompressor_printk("PSW : %016lx %016lx (%pS)\n", S390_lowcore.psw_save_area.mask, S390_lowcore.psw_save_area.addr, diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 8afb232d7c96..44ef08518f37 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -142,7 +142,8 @@ static void copy_bootdata(void) } #ifdef CONFIG_PIE_BUILD -static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, unsigned long offset) +static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, + unsigned long offset, unsigned long phys_offset) { Elf64_Rela *rela_start, *rela_end, *rela; int r_type, r_sym, rc; @@ -153,7 +154,7 @@ static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, rela_end = (Elf64_Rela *) vmlinux.rela_dyn_end; dynsym = (Elf64_Sym *) vmlinux.dynsym_start; for (rela = rela_start; rela < rela_end; rela++) { - loc = rela->r_offset + offset; + loc = rela->r_offset + phys_offset; val = rela->r_addend; r_sym = ELF64_R_SYM(rela->r_info); if (r_sym) { @@ -194,7 +195,8 @@ static void free_relocs(void) physmem_free(RR_RELOC); } -static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, unsigned long offset) +static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, + unsigned long offset, unsigned long phys_offset) { int *reloc; long loc; @@ -428,8 +430,9 @@ void startup_kernel(void) THREAD_SIZE, vmlinux.default_lma, ident_map_size); if (vmlinux_lma) { - __kaslr_offset = vmlinux_lma - vmlinux.default_lma; - kaslr_adjust_vmlinux_info(__kaslr_offset); + __kaslr_offset_phys = vmlinux_lma - vmlinux.default_lma; + kaslr_adjust_vmlinux_info(__kaslr_offset_phys); + __kaslr_offset = __kaslr_offset_phys; } } vmlinux_lma = vmlinux_lma ?: vmlinux.default_lma; @@ -438,7 +441,7 @@ void startup_kernel(void) if (!IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) { img = decompress_kernel(); memmove((void *)vmlinux_lma, img, vmlinux.image_size); - } else if (__kaslr_offset) { + } else if (__kaslr_offset_phys) { img = (void *)vmlinux.default_lma; memmove((void *)vmlinux_lma, img, vmlinux.image_size); memset(img, 0, vmlinux.image_size); @@ -465,7 +468,8 @@ void startup_kernel(void) * to bootdata made by setup_vmem() */ clear_bss_section(vmlinux_lma); - kaslr_adjust_relocs(vmlinux_lma, vmlinux_lma + vmlinux.image_size, 
__kaslr_offset); + kaslr_adjust_relocs(vmlinux_lma, vmlinux_lma + vmlinux.image_size, + __kaslr_offset, __kaslr_offset_phys); kaslr_adjust_got(__kaslr_offset); free_relocs(); setup_vmem(asce_limit); @@ -475,7 +479,7 @@ void startup_kernel(void) * Save KASLR offset for early dumps, before vmcore_info is set. * Mark as uneven to distinguish from real vmcore_info pointer. */ - S390_lowcore.vmcore_info = __kaslr_offset ? __kaslr_offset | 0x1UL : 0; + S390_lowcore.vmcore_info = __kaslr_offset_phys ? __kaslr_offset_phys | 0x1UL : 0; /* * Jump to the decompressed kernel entry point and switch DAT mode on. diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index f13a4527bf2d..e2137b6ec2ed 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -180,12 +180,14 @@ int arch_make_page_accessible(struct page *page); struct vm_layout { unsigned long kaslr_offset; + unsigned long kaslr_offset_phys; unsigned long identity_size; }; extern struct vm_layout vm_layout; #define __kaslr_offset vm_layout.kaslr_offset +#define __kaslr_offset_phys vm_layout.kaslr_offset_phys #define ident_map_size vm_layout.identity_size static inline unsigned long kaslr_offset(void) From 7de0446f0b26589fa80e384d8edaa2c279583652 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 10 Aug 2023 21:40:19 +0200 Subject: [PATCH 27/54] s390/boot: Make identity mapping base address explicit This is a preparatory rework to allow uncoupling virtual and physical address spaces. Currently the identity mapping base address is implicit and is always set to zero. Make it explicit by putting it into the __identity_base persistent boot variable and using it in its proper context, which is as the value of PAGE_OFFSET. Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/page.h | 6 ++++-- arch/s390/kernel/setup.c | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index e2137b6ec2ed..95403750d25a 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -181,6 +181,7 @@ int arch_make_page_accessible(struct page *page); struct vm_layout { unsigned long kaslr_offset; unsigned long kaslr_offset_phys; + unsigned long identity_base; unsigned long identity_size; }; @@ -188,6 +189,7 @@ extern struct vm_layout vm_layout; #define __kaslr_offset vm_layout.kaslr_offset #define __kaslr_offset_phys vm_layout.kaslr_offset_phys +#define __identity_base vm_layout.identity_base #define ident_map_size vm_layout.identity_size static inline unsigned long kaslr_offset(void) @@ -203,8 +205,8 @@ static inline int kaslr_enabled(void) return 0; } -#define __PAGE_OFFSET 0x0UL -#define PAGE_OFFSET 0x0UL +#define __PAGE_OFFSET __identity_base +#define PAGE_OFFSET __PAGE_OFFSET #define __pa_nodebug(x) ((unsigned long)(x)) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index b9d70869bc36..0544830ffb90 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -149,6 +149,7 @@ unsigned long __bootdata_preserved(max_mappable); struct physmem_info __bootdata(physmem_info); struct vm_layout __bootdata_preserved(vm_layout); +EXPORT_SYMBOL_GPL(vm_layout); int __bootdata_preserved(__kaslr_enabled); unsigned int __bootdata_preserved(zlib_dfltcc_support); EXPORT_SYMBOL(zlib_dfltcc_support); From 5fb50fa66ab94141c0692dc5043ed30e6159a81b Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 11 Aug 2023 09:49:27 +0200 Subject: [PATCH 28/54] s390/boot: Make .amode31 section address range
explicit This is a preparatory rework to allow uncoupling virtual and physical address spaces. Introduce the AMODE31_START and AMODE31_END macros for the .amode31 section address range, for later use. Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/physmem_info.h | 3 +++ arch/s390/kernel/setup.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/physmem_info.h b/arch/s390/include/asm/physmem_info.h index e747b067f8db..d2f1cc742695 100644 --- a/arch/s390/include/asm/physmem_info.h +++ b/arch/s390/include/asm/physmem_info.h @@ -170,4 +170,7 @@ static inline unsigned long get_physmem_reserved(enum reserved_range_type type, return *size; } +#define AMODE31_START (physmem_info.reserved[RR_AMODE31].start) +#define AMODE31_END (physmem_info.reserved[RR_AMODE31].end) + #endif diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 0544830ffb90..cbd5290939df 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -765,7 +765,7 @@ static void __init relocate_amode31_section(void) unsigned long amode31_size = __eamode31 - __samode31; long amode31_offset, *ptr; - amode31_offset = physmem_info.reserved[RR_AMODE31].start - (unsigned long)__samode31; + amode31_offset = AMODE31_START - (unsigned long)__samode31; pr_info("Relocating AMODE31 section of size 0x%08lx\n", amode31_size); /* Move original AMODE31 section to the new one */ From 88702793c5b4ef127a1f57d76920a80f70d081a7 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 27 Jan 2022 14:28:49 +0100 Subject: [PATCH 29/54] s390/os_info: Introduce value entries Introduce entries that do not reference any data in memory, but rather provide values. Set the size of such entries to zero and do not compute a checksum for them, since there is no data whose integrity needs to be checked. The integrity of the value entries themselves is still covered by the os_info checksum. Reserve the lowest unused entry index OS_INFO_RESERVED for future use, presumably for the number of entries present. That could later be used by user-level tools. Existing tools would not notice any difference.
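A sketch of how a consumer could distinguish the two entry kinds under this scheme (simplified; the real readers also verify the data checksum for data entries, and the helper name here is illustrative):

static unsigned long entry_as_value(const struct os_info_entry *entry)
{
	if (!entry->size)	/* value entry: payload lives in .val, nothing to checksum */
		return entry->val;
	return entry->addr;	/* data entry: physical address of the referenced data */
}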
Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/os_info.h | 19 +++++++++++++++---- arch/s390/kernel/ipl.c | 6 +++--- arch/s390/kernel/os_info.c | 15 +++++++++++++-- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/arch/s390/include/asm/os_info.h b/arch/s390/include/asm/os_info.h index a4d2e103f116..d2f4ba67c006 100644 --- a/arch/s390/include/asm/os_info.h +++ b/arch/s390/include/asm/os_info.h @@ -17,11 +17,15 @@ #define OS_INFO_VMCOREINFO 0 #define OS_INFO_REIPL_BLOCK 1 #define OS_INFO_FLAGS_ENTRY 2 +#define OS_INFO_RESERVED 3 #define OS_INFO_FLAG_REIPL_CLEAR (1UL << 0) struct os_info_entry { - u64 addr; + union { + u64 addr; + u64 val; + }; u64 size; u32 csum; } __packed; @@ -33,17 +37,24 @@ struct os_info { u16 version_minor; u64 crashkernel_addr; u64 crashkernel_size; - struct os_info_entry entry[3]; - u8 reserved[4004]; + struct os_info_entry entry[4]; + u8 reserved[3984]; } __packed; void os_info_init(void); -void os_info_entry_add(int nr, void *ptr, u64 len); +void os_info_entry_add_data(int nr, void *ptr, u64 len); +void os_info_entry_add_val(int nr, u64 val); void os_info_crashkernel_add(unsigned long base, unsigned long size); u32 os_info_csum(struct os_info *os_info); #ifdef CONFIG_CRASH_DUMP void *os_info_old_entry(int nr, unsigned long *size); +static inline unsigned long os_info_old_value(int nr) +{ + unsigned long size; + + return (unsigned long)os_info_old_entry(nr, &size); +} #else static inline void *os_info_old_entry(int nr, unsigned long *size) { diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 1486350a4177..7dc54571f18e 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -1209,8 +1209,8 @@ static struct attribute_group reipl_nss_attr_group = { void set_os_info_reipl_block(void) { - os_info_entry_add(OS_INFO_REIPL_BLOCK, reipl_block_actual, - reipl_block_actual->hdr.len); + os_info_entry_add_data(OS_INFO_REIPL_BLOCK, reipl_block_actual, + reipl_block_actual->hdr.len); } /* reipl type */ @@ -1940,7 +1940,7 @@ static void dump_reipl_run(struct shutdown_trigger *trigger) reipl_type == IPL_TYPE_NSS || reipl_type == IPL_TYPE_UNKNOWN) os_info_flags |= OS_INFO_FLAG_REIPL_CLEAR; - os_info_entry_add(OS_INFO_FLAGS_ENTRY, &os_info_flags, sizeof(os_info_flags)); + os_info_entry_add_data(OS_INFO_FLAGS_ENTRY, &os_info_flags, sizeof(os_info_flags)); csum = (__force unsigned int)cksm(reipl_block_actual, reipl_block_actual->hdr.len, 0); abs_lc = get_abs_lowcore(); abs_lc->ipib = __pa(reipl_block_actual); diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c index a801e6bd5341..3800824f8466 100644 --- a/arch/s390/kernel/os_info.c +++ b/arch/s390/kernel/os_info.c @@ -43,9 +43,9 @@ void os_info_crashkernel_add(unsigned long base, unsigned long size) } /* - * Add OS info entry and update checksum + * Add OS info data entry and update checksum */ -void os_info_entry_add(int nr, void *ptr, u64 size) +void os_info_entry_add_data(int nr, void *ptr, u64 size) { os_info.entry[nr].addr = __pa(ptr); os_info.entry[nr].size = size; @@ -53,6 +53,17 @@ void os_info_entry_add(int nr, void *ptr, u64 size) os_info.csum = os_info_csum(&os_info); } +/* + * Add OS info value entry and update checksum + */ +void os_info_entry_add_val(int nr, u64 value) +{ + os_info.entry[nr].val = value; + os_info.entry[nr].size = 0; + os_info.entry[nr].csum = 0; + os_info.csum = os_info_csum(&os_info); +} + /* * Initialize OS info structure and set lowcore pointer */ From 8572f52518f69842d983b45eefa7d4efccd233de Mon Sep 17 
00:00:00 2001 From: Alexander Gordeev Date: Fri, 11 Aug 2023 10:07:31 +0200 Subject: [PATCH 30/54] s390/os_info: Store virtual memory layout This is a preparatory rework to allow uncoupling virtual and physical address spaces. The virtual memory layout will be read out by makedumpfile, crash and other user-level tools for virtual address translation. Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/os_info.h | 10 ++++++++-- arch/s390/kernel/os_info.c | 7 +++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/os_info.h b/arch/s390/include/asm/os_info.h index d2f4ba67c006..621d49aa9c8d 100644 --- a/arch/s390/include/asm/os_info.h +++ b/arch/s390/include/asm/os_info.h @@ -18,6 +18,12 @@ #define OS_INFO_REIPL_BLOCK 1 #define OS_INFO_FLAGS_ENTRY 2 #define OS_INFO_RESERVED 3 +#define OS_INFO_IDENTITY_BASE 4 +#define OS_INFO_KASLR_OFFSET 5 +#define OS_INFO_KASLR_OFF_PHYS 6 +#define OS_INFO_VMEMMAP 7 +#define OS_INFO_AMODE31_START 8 +#define OS_INFO_AMODE31_END 9 #define OS_INFO_FLAG_REIPL_CLEAR (1UL << 0) @@ -37,8 +43,8 @@ struct os_info { u16 version_minor; u64 crashkernel_addr; u64 crashkernel_size; - struct os_info_entry entry[4]; - u8 reserved[3984]; + struct os_info_entry entry[10]; + u8 reserved[3864]; } __packed; void os_info_init(void); diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c index 3800824f8466..25bcda341630 100644 --- a/arch/s390/kernel/os_info.c +++ b/arch/s390/kernel/os_info.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -74,6 +75,12 @@ void __init os_info_init(void) os_info.version_major = OS_INFO_VERSION_MAJOR; os_info.version_minor = OS_INFO_VERSION_MINOR; os_info.magic = OS_INFO_MAGIC; + os_info_entry_add_val(OS_INFO_IDENTITY_BASE, __identity_base); + os_info_entry_add_val(OS_INFO_KASLR_OFFSET, kaslr_offset()); + os_info_entry_add_val(OS_INFO_KASLR_OFF_PHYS, __kaslr_offset_phys); + os_info_entry_add_val(OS_INFO_VMEMMAP, (unsigned long)vmemmap); + os_info_entry_add_val(OS_INFO_AMODE31_START, AMODE31_START); + os_info_entry_add_val(OS_INFO_AMODE31_END, AMODE31_END); os_info.csum = os_info_csum(&os_info); abs_lc = get_abs_lowcore(); abs_lc->os_info = __pa(&os_info); From 378e32aa81971e8f5594372c6a9d75aa3cf52c99 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 11 Aug 2023 10:10:53 +0200 Subject: [PATCH 31/54] s390/vmcoreinfo: Store virtual memory layout This is a preparatory rework to allow uncoupling virtual and physical address spaces. The virtual memory layout is needed for address translation by the crash tool when the /proc/kcore device is used as the memory image.
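As an illustration, a dump tool could extract the new fields from the vmcoreinfo note with a helper along these lines (a minimal sketch with no error handling; the "NAME=%lx\n" format follows from the patch below):

#include <stdlib.h>
#include <string.h>

/* fields are appended as "IDENTITYBASE=%lx\n" etc.; minimal extraction sketch */
static unsigned long vmcoreinfo_read_ulong(const char *note, const char *key)
{
	const char *p = strstr(note, key);

	if (!p || p[strlen(key)] != '=')
		return 0;
	return strtoul(p + strlen(key) + 1, NULL, 16);
}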
Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/kernel/vmcore_info.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/kernel/vmcore_info.c b/arch/s390/kernel/vmcore_info.c index d296dfc22191..23f7d7619a99 100644 --- a/arch/s390/kernel/vmcore_info.c +++ b/arch/s390/kernel/vmcore_info.c @@ -14,7 +14,9 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); vmcoreinfo_append_str("SAMODE31=%lx\n", (unsigned long)__samode31); vmcoreinfo_append_str("EAMODE31=%lx\n", (unsigned long)__eamode31); + vmcoreinfo_append_str("IDENTITYBASE=%lx\n", __identity_base); vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + vmcoreinfo_append_str("KERNELOFFPHYS=%lx\n", __kaslr_offset_phys); abs_lc = get_abs_lowcore(); abs_lc->vmcore_info = paddr_vmcoreinfo_note(); put_abs_lowcore(abs_lc); From f4cac27dc0d6ba9640c0ce1c42749cfa086cdfb2 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Sat, 30 Sep 2023 10:37:49 +0200 Subject: [PATCH 32/54] s390/crash: Use old os_info to create PT_LOAD headers This is a preparatory rework to allow uncoupling virtual and physical address spaces. The vmcore ELF program headers describe virtual memory regions of a crashed kernel. User-level tools use that information for kernel text and data analysis (e.g. vmcore-dmesg extracts the kernel log). Currently the kernel image is covered by program headers describing the identity mapping regions. But in the future the kernel image will be mapped into a separate region outside of the identity mapping. Create an additional ELF program header that covers the kernel image only, so that vmcore tools can locate kernel text and data. Further, the identity mapping in the crashed and capture kernels will have different base addresses. Due to that, the __va() macro cannot be used in the capture kernel. Instead, read the crashed kernel's identity mapping base address from os_info and use it for the creation of PT_LOAD type program headers. Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/os_info.h | 3 +++ arch/s390/kernel/crash_dump.c | 41 +++++++++++++++++++++++++++++---- arch/s390/kernel/os_info.c | 3 +++ 3 files changed, 42 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/os_info.h b/arch/s390/include/asm/os_info.h index 621d49aa9c8d..dea2b37b635e 100644 --- a/arch/s390/include/asm/os_info.h +++ b/arch/s390/include/asm/os_info.h @@ -24,6 +24,9 @@ #define OS_INFO_VMEMMAP 7 #define OS_INFO_AMODE31_START 8 #define OS_INFO_AMODE31_END 9 +#define OS_INFO_IMAGE_START 10 +#define OS_INFO_IMAGE_END 11 +#define OS_INFO_IMAGE_PHYS 12 #define OS_INFO_FLAG_REIPL_CLEAR (1UL << 0) diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index d09ebb6f5262..9863ebe75019 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -465,7 +465,11 @@ static void *ehdr_init(Elf64_Ehdr *ehdr, int mem_chunk_cnt) ehdr->e_phoff = sizeof(Elf64_Ehdr); ehdr->e_ehsize = sizeof(Elf64_Ehdr); ehdr->e_phentsize = sizeof(Elf64_Phdr); - ehdr->e_phnum = mem_chunk_cnt + 1; + /* + * Number of memory chunk PT_LOAD program headers plus one kernel + * image PT_LOAD program header plus one PT_NOTE program header.
+ */ + ehdr->e_phnum = mem_chunk_cnt + 1 + 1; return ehdr + 1; } @@ -501,15 +505,16 @@ static int get_mem_chunk_cnt(void) */ static void loads_init(Elf64_Phdr *phdr) { + unsigned long old_identity_base = os_info_old_value(OS_INFO_IDENTITY_BASE); phys_addr_t start, end; u64 idx; for_each_physmem_range(idx, &oldmem_type, &start, &end) { - phdr->p_filesz = end - start; phdr->p_type = PT_LOAD; + phdr->p_vaddr = old_identity_base + start; phdr->p_offset = start; - phdr->p_vaddr = (unsigned long)__va(start); phdr->p_paddr = start; + phdr->p_filesz = end - start; phdr->p_memsz = end - start; phdr->p_flags = PF_R | PF_W | PF_X; phdr->p_align = PAGE_SIZE; @@ -517,6 +522,25 @@ static void loads_init(Elf64_Phdr *phdr) } } +/* + * Prepare PT_LOAD type program header for kernel image region + */ +static void text_init(Elf64_Phdr *phdr) +{ + unsigned long start_phys = os_info_old_value(OS_INFO_IMAGE_PHYS); + unsigned long start = os_info_old_value(OS_INFO_IMAGE_START); + unsigned long end = os_info_old_value(OS_INFO_IMAGE_END); + + phdr->p_type = PT_LOAD; + phdr->p_vaddr = start; + phdr->p_filesz = end - start; + phdr->p_memsz = end - start; + phdr->p_offset = start_phys; + phdr->p_paddr = start_phys; + phdr->p_flags = PF_R | PF_W | PF_X; + phdr->p_align = PAGE_SIZE; +} + /* * Initialize notes (new kernel) */ @@ -557,6 +581,8 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt) size += nt_vmcoreinfo_size(); /* nt_final */ size += sizeof(Elf64_Nhdr); + /* PT_LOAD type program header for kernel text region */ + size += sizeof(Elf64_Phdr); /* PT_LOADS */ size += mem_chunk_cnt * sizeof(Elf64_Phdr); @@ -568,7 +594,7 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt) */ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) { - Elf64_Phdr *phdr_notes, *phdr_loads; + Elf64_Phdr *phdr_notes, *phdr_loads, *phdr_text; size_t alloc_size; int mem_chunk_cnt; void *ptr, *hdr; @@ -606,14 +632,19 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) /* Init program headers */ phdr_notes = ptr; ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr)); + phdr_text = ptr; + ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr)); phdr_loads = ptr; ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr) * mem_chunk_cnt); /* Init notes */ hdr_off = PTR_DIFF(ptr, hdr); ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off); + /* Init kernel text program header */ + text_init(phdr_text); /* Init loads */ - hdr_off = PTR_DIFF(ptr, hdr); loads_init(phdr_loads); + /* Finalize program headers */ + hdr_off = PTR_DIFF(ptr, hdr); *addr = (unsigned long long) hdr; *size = (unsigned long long) hdr_off; BUG_ON(elfcorehdr_size > alloc_size); diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c index 25bcda341630..3f4dc045a894 100644 --- a/arch/s390/kernel/os_info.c +++ b/arch/s390/kernel/os_info.c @@ -81,6 +81,9 @@ void __init os_info_init(void) os_info_entry_add_val(OS_INFO_VMEMMAP, (unsigned long)vmemmap); os_info_entry_add_val(OS_INFO_AMODE31_START, AMODE31_START); os_info_entry_add_val(OS_INFO_AMODE31_END, AMODE31_END); + os_info_entry_add_val(OS_INFO_IMAGE_START, (unsigned long)_stext); + os_info_entry_add_val(OS_INFO_IMAGE_END, (unsigned long)_end); + os_info_entry_add_val(OS_INFO_IMAGE_PHYS, __pa_symbol(_stext)); os_info.csum = os_info_csum(&os_info); abs_lc = get_abs_lowcore(); abs_lc->os_info = __pa(&os_info); From c98d2ecae08f02bd2dccd24e7e485e9f0211db65 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 1 Mar 2024 07:15:22 +0100 Subject: [PATCH 33/54] s390/mm: Uncouple physical vs virtual 
address spaces Uncoupling the physical and virtual address spaces brings the following benefits to s390: - virtual memory layout flexibility; - closes the address gap between kernel and modules, which caused s390-only problems in the past (e.g. 'perf' bugs); - allows getting rid of trampolines used for module calls into the kernel; - allows simplifying the BPF trampoline; - minor performance improvement in branch prediction; - kernel randomization entropy is an order of magnitude bigger, as it is derived from the amount of available virtual, not physical memory; The whole change can be described by the two pictures below: before and after the change. Some aspects of the virtual memory layout setup are not clarified (number of page levels, alignment, DMA memory), since these are not a part of this change or are secondary with regard to how the uncoupling itself is implemented. The focus of the pictures is to explain why the __va() and __pa() macros are implemented the way they are. Memory layout in V==R mode: | Physical | Virtual | +- 0 --------------+- 0 --------------+ identity mapping start | | S390_lowcore | Low-address memory | +- 8 KB -----------+ | | | | | identity | phys == virt | | mapping | virt == phys | | | +- AMODE31_START --+- AMODE31_START --+ .amode31 rand. phys/virt start |.amode31 text/data|.amode31 text/data| +- AMODE31_END ----+- AMODE31_END ----+ .amode31 rand. phys/virt end | | | | | | +- __kaslr_offset, __kaslr_offset_phys| kernel rand. phys/virt start | | | | kernel text/data | kernel text/data | phys == kvirt | | | +------------------+------------------+ kernel phys/virt end | | | | | | | | | | | | +- ident_map_size -+- ident_map_size -+ identity mapping end | | | ... unused gap | | | +---- vmemmap -----+ 'struct page' array start | | | virtually mapped | | memory map | | | +- __abs_lowcore --+ | | | Absolute Lowcore | | | +- __memcpy_real_area | | | Real Memory Copy| | | +- VMALLOC_START --+ vmalloc area start | | | vmalloc area | | | +- MODULES_VADDR --+ modules area start | | | modules area | | | +------------------+ UltraVisor Secure Storage limit | | | ... unused gap | | | +KASAN_SHADOW_START+ KASAN shadow memory start | | | KASAN shadow | | | +------------------+ ASCE limit Memory layout in V!=R mode: | Physical | Virtual | +- 0 --------------+- 0 --------------+ | | S390_lowcore | Low-address memory | +- 8 KB -----------+ | | | | | | | | ... unused gap | | | | +- AMODE31_START --+- AMODE31_START --+ .amode31 rand. phys/virt start |.amode31 text/data|.amode31 text/data| +- AMODE31_END ----+- AMODE31_END ----+ .amode31 rand. phys/virt end (<2GB) | | | | | | +- __kaslr_offset_phys | kernel rand. phys start | | | | kernel text/data | | | | | +------------------+ | kernel phys end | | | | | | | | | | | | +- ident_map_size -+ | | | | ... unused gap | | | +- __identity_base + identity mapping start (>= 2GB) | | | identity | phys == virt - __identity_base | mapping | virt == phys + __identity_base | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +---- vmemmap -----+ 'struct page' array start | | | virtually mapped | | memory map | | | +- __abs_lowcore --+ | | | Absolute Lowcore | | | +- __memcpy_real_area | | | Real Memory Copy| | | +- VMALLOC_START --+ vmalloc area start | | | vmalloc area | | | +- MODULES_VADDR --+ modules area start | | | modules area | | | +- __kaslr_offset -+ kernel rand. virt start | | | kernel text/data | phys == (kvirt - __kaslr_offset) + | | __kaslr_offset_phys +- kernel .bss end + kernel rand. virt end | | | ...
unused gap | | | +------------------+ UltraVisor Secure Storage limit | | | ... unused gap | | | +KASAN_SHADOW_START+ KASAN shadow memory start | | | KASAN shadow | | | +------------------+ ASCE limit Unused gaps in the virtual memory layout could be present or not, depending on how a particular system is configured. No page tables are created for the unused gaps. The relative order of vmalloc, modules and kernel image in virtual memory is defined by the following considerations: - start of the modules area and end of the kernel should reside within 4GB to accommodate relative 32-bit jumps. The best way to achieve that is to place the kernel next to the modules; - vmalloc and module areas should be located next to each other to prevent failures and extra reworks in user-level tools (makedumpfile, crash, etc.) which treat vmalloc and module addresses similarly; - the kernel needs to be the last area in the virtual memory layout to easily distinguish between kernel and non-kernel virtual addresses. That is needed to (again) simplify handling of addresses in user-level tools and make the __pa() macro faster (see below); Concluding the above, the relative order of the considered virtual areas in memory is: vmalloc - modules - kernel. Therefore, the only change to the current memory layout is moving the kernel to the end of the virtual address space. With that approach the implementation of the __pa() macro is straightforward: all linear virtual addresses less than the kernel base are considered identity mapping: phys == virt - __identity_base All addresses greater than the kernel base are kernel ones: phys == (kvirt - __kaslr_offset) + __kaslr_offset_phys By contrast, the __va() macro deals only with identity mapping addresses: virt == phys + __identity_base The .amode31 section is mapped separately and is not covered by the __pa() macro. In fact, it could have been handled easily by checking whether a virtual address is within the section or not, but there is no need for that. Thus, let the __pa() code spend as few machine cycles as possible. The KASAN shadow memory is located at the very end of the virtual memory layout, at addresses higher than the kernel. However, that is not a linear mapping and no code other than KASAN instrumentation or API is expected to access it. When KASLR mode is enabled the kernel base address is randomized within a memory window that spans the whole unused virtual address space. The size of that window depends on the amount of physical memory available to the system, the limit imposed by UltraVisor (if present) and the vmalloc area size as provided by the vmalloc= kernel command line parameter. In case the virtual memory is exhausted, the minimum size of the randomization window is forcefully set to 2GB, which amounts to 15 bits of entropy if KASAN is enabled or 17 bits of entropy in the default configuration. The default kernel offset 0x100000 is used as a magic value both in the decompressor code and in the vmlinux linker script, but it will be removed with a follow-up change.
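Condensed into code, the translation rules spelled out above look roughly as follows (a sketch of the described semantics for the V!=R mode, not a verbatim copy of the in-tree macros):

/* all linear virtual addresses below the kernel base are identity mapped */
static inline unsigned long sketch_pa(unsigned long va)
{
	if (va < __kaslr_offset)
		return va - __identity_base;
	/* addresses at or above the kernel base belong to the kernel image */
	return va - __kaslr_offset + __kaslr_offset_phys;
}

/* __va() only ever deals with identity mapping addresses */
static inline void *sketch_va(unsigned long pa)
{
	return (void *)(pa + __identity_base);
}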
Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- Documentation/arch/s390/index.rst | 1 + Documentation/arch/s390/mm.rst | 111 ++++++++++++++++++++++++++++++ arch/s390/boot/boot.h | 7 +- arch/s390/boot/kaslr.c | 2 +- arch/s390/boot/startup.c | 55 +++++++++++---- arch/s390/boot/vmem.c | 111 ++++++++++++++++-------------- arch/s390/include/asm/page.h | 14 ++-- arch/s390/include/asm/pgtable.h | 6 ++ arch/s390/mm/vmem.c | 2 + 9 files changed, 241 insertions(+), 68 deletions(-) create mode 100644 Documentation/arch/s390/mm.rst diff --git a/Documentation/arch/s390/index.rst b/Documentation/arch/s390/index.rst index 73c79bf586fd..e75a6e5d2505 100644 --- a/Documentation/arch/s390/index.rst +++ b/Documentation/arch/s390/index.rst @@ -8,6 +8,7 @@ s390 Architecture cds 3270 driver-model + mm monreader qeth s390dbf diff --git a/Documentation/arch/s390/mm.rst b/Documentation/arch/s390/mm.rst new file mode 100644 index 000000000000..084adad5eef9 --- /dev/null +++ b/Documentation/arch/s390/mm.rst @@ -0,0 +1,111 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================= +Memory Management +================= + +Virtual memory layout +===================== + +.. note:: + + - Some aspects of the virtual memory layout setup are not + clarified (number of page levels, alignment, DMA memory). + + - Unused gaps in the virtual memory layout could be present + or not, depending on how a particular system is configured. + No page tables are created for the unused gaps. + + - The virtual memory regions are tracked or untracked by KASAN + instrumentation; the KASAN shadow memory itself is + created only when the CONFIG_KASAN configuration option is enabled. + +:: + + ============================================================================= + | Physical | Virtual | VM area description + ============================================================================= + +- 0 --------------+- 0 --------------+ + | | S390_lowcore | Low-address memory + | +- 8 KB -----------+ + | | | + | | | + | | ... unused gap | KASAN untracked + | | | + +- AMODE31_START --+- AMODE31_START --+ .amode31 rand. phys/virt start + |.amode31 text/data|.amode31 text/data| KASAN untracked + +- AMODE31_END ----+- AMODE31_END ----+ .amode31 rand. phys/virt end (<2GB) + | | | + | | | + +- __kaslr_offset_phys | kernel rand. phys start + | | | + | kernel text/data | | + | | | + +------------------+ | kernel phys end + | | | + | | | + | | | + | | | + +- ident_map_size -+ | + | | + | ... unused gap | KASAN untracked + | | + +- __identity_base + identity mapping start (>= 2GB) + | | + | identity | phys == virt - __identity_base + | mapping | virt == phys + __identity_base + | | + | | KASAN tracked + | | + | | + | | + | | + | | + | | + | | + | | + | | + | | + | | + | | + | | + +---- vmemmap -----+ 'struct page' array start + | | + | virtually mapped | + | memory map | KASAN untracked + | | + +- __abs_lowcore --+ + | | + | Absolute Lowcore | KASAN untracked + | | + +- __memcpy_real_area + | | + | Real Memory Copy| KASAN untracked + | | + +- VMALLOC_START --+ vmalloc area start + | | KASAN untracked or + | vmalloc area | KASAN shallowly populated in case + | | CONFIG_KASAN_VMALLOC=y + +- MODULES_VADDR --+ modules area start + | | KASAN allocated per module or + | modules area | KASAN shallowly populated in case + | | CONFIG_KASAN_VMALLOC=y + +- __kaslr_offset -+ kernel rand. virt start + | | KASAN tracked + | kernel text/data | phys == (kvirt - __kaslr_offset) + + | | __kaslr_offset_phys + +- kernel .bss end + kernel rand.
virt end + | | + | ... unused gap | KASAN untracked + | | + +------------------+ UltraVisor Secure Storage limit + | | + | ... unused gap | KASAN untracked + | | + +KASAN_SHADOW_START+ KASAN shadow memory start + | | + | KASAN shadow | KASAN untracked + | | + +------------------+ ASCE limit diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index 567d60f78bbc..412cdd490403 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -74,10 +74,11 @@ void sclp_early_setup_buffer(void); void print_pgm_check_info(void); unsigned long randomize_within_range(unsigned long size, unsigned long align, unsigned long min, unsigned long max); -void setup_vmem(unsigned long asce_limit); +void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned long asce_limit); void __printf(1, 2) decompressor_printk(const char *fmt, ...); void print_stacktrace(unsigned long sp); void error(char *m); +int get_random(unsigned long limit, unsigned long *value); extern struct machine_info machine; @@ -98,6 +99,10 @@ extern struct vmlinux_info _vmlinux_info; #define vmlinux _vmlinux_info #define __abs_lowcore_pa(x) (((unsigned long)(x) - __abs_lowcore) % sizeof(struct lowcore)) +#define __kernel_va(x) ((void *)((unsigned long)(x) - __kaslr_offset_phys + __kaslr_offset)) +#define __kernel_pa(x) ((unsigned long)(x) - __kaslr_offset + __kaslr_offset_phys) +#define __identity_va(x) ((void *)((unsigned long)(x) + __identity_base)) +#define __identity_pa(x) ((unsigned long)(x) - __identity_base) static inline bool intersects(unsigned long addr0, unsigned long size0, unsigned long addr1, unsigned long size1) diff --git a/arch/s390/boot/kaslr.c b/arch/s390/boot/kaslr.c index 90602101e2ae..bd3bf5ef472d 100644 --- a/arch/s390/boot/kaslr.c +++ b/arch/s390/boot/kaslr.c @@ -43,7 +43,7 @@ static int check_prng(void) return PRNG_MODE_TDES; } -static int get_random(unsigned long limit, unsigned long *value) +int get_random(unsigned long limit, unsigned long *value) { struct prng_parm prng = { /* initial parameter block for tdes mode, copied from libica */ diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 44ef08518f37..3efa1f457451 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -203,7 +203,7 @@ static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, /* Adjust R_390_64 relocations */ for (reloc = vmlinux_relocs_64_start; reloc < vmlinux_relocs_64_end; reloc++) { - loc = (long)*reloc + offset; + loc = (long)*reloc + phys_offset; if (loc < min_addr || loc > max_addr) error("64-bit relocation outside of kernel!\n"); *(u64 *)loc += offset; @@ -263,8 +263,25 @@ static void setup_ident_map_size(unsigned long max_physmem_end) #endif } -static unsigned long setup_kernel_memory_layout(void) +#define FIXMAP_SIZE round_up(MEMCPY_REAL_SIZE + ABS_LOWCORE_MAP_SIZE, sizeof(struct lowcore)) + +static unsigned long get_vmem_size(unsigned long identity_size, + unsigned long vmemmap_size, + unsigned long vmalloc_size, + unsigned long rte_size) { + unsigned long max_mappable, vsize; + + max_mappable = max(identity_size, MAX_DCSS_ADDR); + vsize = round_up(SZ_2G + max_mappable, rte_size) + + round_up(vmemmap_size, rte_size) + + FIXMAP_SIZE + MODULES_LEN + KASLR_LEN; + return size_add(vsize, vmalloc_size); +} + +static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) +{ + unsigned long kernel_start, kernel_end; unsigned long vmemmap_start; unsigned long asce_limit; unsigned long rte_size; @@ -277,12 +294,11 @@ static unsigned long 
setup_kernel_memory_layout(void) vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page); /* choose kernel address space layout: 4 or 3 levels. */ - vsize = round_up(ident_map_size, _REGION3_SIZE) + vmemmap_size + - MODULES_LEN + MEMCPY_REAL_SIZE + ABS_LOWCORE_MAP_SIZE; - vsize = size_add(vsize, vmalloc_size); + vsize = get_vmem_size(ident_map_size, vmemmap_size, vmalloc_size, _REGION3_SIZE); if (IS_ENABLED(CONFIG_KASAN) || (vsize > _REGION2_SIZE)) { asce_limit = _REGION1_SIZE; rte_size = _REGION2_SIZE; + vsize = get_vmem_size(ident_map_size, vmemmap_size, vmalloc_size, _REGION2_SIZE); } else { asce_limit = _REGION2_SIZE; rte_size = _REGION3_SIZE; @@ -298,12 +314,26 @@ static unsigned long setup_kernel_memory_layout(void) /* force vmalloc and modules below kasan shadow */ vmax = min(vmax, KASAN_SHADOW_START); #endif - MODULES_END = round_down(vmax, _SEGMENT_SIZE); + kernel_end = vmax; + if (kaslr_enabled()) { + unsigned long kaslr_len, slots, pos; + + vsize = min(vsize, vmax); + kaslr_len = max(KASLR_LEN, vmax - vsize); + slots = DIV_ROUND_UP(kaslr_len - kernel_size, THREAD_SIZE); + if (get_random(slots, &pos)) + pos = 0; + kernel_end -= pos * THREAD_SIZE; + } + kernel_start = round_down(kernel_end - kernel_size, THREAD_SIZE); + __kaslr_offset = kernel_start; + + MODULES_END = round_down(kernel_start, _SEGMENT_SIZE); MODULES_VADDR = MODULES_END - MODULES_LEN; VMALLOC_END = MODULES_VADDR; /* allow vmalloc area to occupy up to about 1/2 of the rest virtual space left */ - vsize = (VMALLOC_END - (MEMCPY_REAL_SIZE + ABS_LOWCORE_MAP_SIZE)) / 2; + vsize = (VMALLOC_END - FIXMAP_SIZE) / 2; vsize = round_down(vsize, _SEGMENT_SIZE); vmalloc_size = min(vmalloc_size, vsize); VMALLOC_START = VMALLOC_END - vmalloc_size; @@ -330,6 +360,7 @@ static unsigned long setup_kernel_memory_layout(void) BUILD_BUG_ON(MAX_DCSS_ADDR > (1UL << MAX_PHYSMEM_BITS)); max_mappable = max(ident_map_size, MAX_DCSS_ADDR); max_mappable = min(max_mappable, vmemmap_start); + __identity_base = round_down(vmemmap_start - max_mappable, rte_size); return asce_limit; } @@ -358,7 +389,6 @@ static void setup_vmalloc_size(void) static void kaslr_adjust_vmlinux_info(unsigned long offset) { - *(unsigned long *)(&vmlinux.entry) += offset; vmlinux.bootdata_off += offset; vmlinux.bootdata_preserved_off += offset; #ifdef CONFIG_PIE_BUILD @@ -386,6 +416,7 @@ void startup_kernel(void) unsigned long max_physmem_end; unsigned long vmlinux_lma = 0; unsigned long amode31_lma = 0; + unsigned long kernel_size; unsigned long asce_limit; unsigned long safe_addr; void *img; @@ -417,7 +448,8 @@ void startup_kernel(void) max_physmem_end = detect_max_physmem_end(); setup_ident_map_size(max_physmem_end); setup_vmalloc_size(); - asce_limit = setup_kernel_memory_layout(); + kernel_size = vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size; + asce_limit = setup_kernel_memory_layout(kernel_size); /* got final ident_map_size, physmem allocations could be performed now */ physmem_set_usable_limit(ident_map_size); detect_physmem_online_ranges(max_physmem_end); @@ -432,7 +464,6 @@ void startup_kernel(void) if (vmlinux_lma) { __kaslr_offset_phys = vmlinux_lma - vmlinux.default_lma; kaslr_adjust_vmlinux_info(__kaslr_offset_phys); - __kaslr_offset = __kaslr_offset_phys; } } vmlinux_lma = vmlinux_lma ?: vmlinux.default_lma; @@ -472,7 +503,7 @@ void startup_kernel(void) __kaslr_offset, __kaslr_offset_phys); kaslr_adjust_got(__kaslr_offset); free_relocs(); - setup_vmem(asce_limit); + setup_vmem(__kaslr_offset, __kaslr_offset + kernel_size, asce_limit); 
copy_bootdata(); /* @@ -484,7 +515,7 @@ void startup_kernel(void) /* * Jump to the decompressed kernel entry point and switch DAT mode on. */ - psw.addr = vmlinux.entry; + psw.addr = __kaslr_offset + vmlinux.entry; psw.mask = PSW_KERNEL_BITS; __load_psw(psw); } diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index 09b10bb6e4d0..86edd432e593 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -27,6 +27,8 @@ enum populate_mode { POPULATE_NONE, POPULATE_DIRECT, POPULATE_ABS_LOWCORE, + POPULATE_IDENTITY, + POPULATE_KERNEL, #ifdef CONFIG_KASAN POPULATE_KASAN_MAP_SHADOW, POPULATE_KASAN_ZERO_SHADOW, @@ -54,7 +56,7 @@ static inline void kasan_populate(unsigned long start, unsigned long end, enum p pgtable_populate(start, end, mode); } -static void kasan_populate_shadow(void) +static void kasan_populate_shadow(unsigned long kernel_start, unsigned long kernel_end) { pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY); pud_t pud_z = __pud(__pa(kasan_early_shadow_pmd) | _REGION3_ENTRY); @@ -76,44 +78,20 @@ static void kasan_populate_shadow(void) __arch_set_page_dat(kasan_early_shadow_pmd, 1UL << CRST_ALLOC_ORDER); __arch_set_page_dat(kasan_early_shadow_pte, 1); - /* - * Current memory layout: - * +- 0 -------------+ +- shadow start -+ - * |1:1 ident mapping| /|1/8 of ident map| - * | | / | | - * +-end of ident map+ / +----------------+ - * | ... gap ... | / | kasan | - * | | / | zero page | - * +- vmalloc area -+ / | mapping | - * | vmalloc_size | / | (untracked) | - * +- modules vaddr -+ / +----------------+ - * | 2Gb |/ | unmapped | allocated per module - * +- shadow start -+ +----------------+ - * | 1/8 addr space | | zero pg mapping| (untracked) - * +- shadow end ----+---------+- shadow end ---+ - * - * Current memory layout (KASAN_VMALLOC): - * +- 0 -------------+ +- shadow start -+ - * |1:1 ident mapping| /|1/8 of ident map| - * | | / | | - * +-end of ident map+ / +----------------+ - * | ... gap ... 
| / | kasan zero page| (untracked) - * | | / | mapping | - * +- vmalloc area -+ / +----------------+ - * | vmalloc_size | / |shallow populate| - * +- modules vaddr -+ / +----------------+ - * | 2Gb |/ |shallow populate| - * +- shadow start -+ +----------------+ - * | 1/8 addr space | | zero pg mapping| (untracked) - * +- shadow end ----+---------+- shadow end ---+ - */ - for_each_physmem_usable_range(i, &start, &end) { - kasan_populate(start, end, POPULATE_KASAN_MAP_SHADOW); - if (memgap_start && physmem_info.info_source == MEM_DETECT_DIAG260) - kasan_populate(memgap_start, start, POPULATE_KASAN_ZERO_SHADOW); + kasan_populate((unsigned long)__identity_va(start), + (unsigned long)__identity_va(end), + POPULATE_KASAN_MAP_SHADOW); + if (memgap_start && physmem_info.info_source == MEM_DETECT_DIAG260) { + kasan_populate((unsigned long)__identity_va(memgap_start), + (unsigned long)__identity_va(start), + POPULATE_KASAN_ZERO_SHADOW); + } memgap_start = end; } + kasan_populate(kernel_start, kernel_end, POPULATE_KASAN_MAP_SHADOW); + kasan_populate(0, (unsigned long)__identity_va(0), POPULATE_KASAN_ZERO_SHADOW); + kasan_populate(AMODE31_START, AMODE31_END, POPULATE_KASAN_ZERO_SHADOW); if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { untracked_end = VMALLOC_START; /* shallowly populate kasan shadow for vmalloc and modules */ @@ -122,8 +100,9 @@ static void kasan_populate_shadow(void) untracked_end = MODULES_VADDR; } /* populate kasan shadow for untracked memory */ - kasan_populate(ident_map_size, untracked_end, POPULATE_KASAN_ZERO_SHADOW); - kasan_populate(MODULES_END, _REGION1_SIZE, POPULATE_KASAN_ZERO_SHADOW); + kasan_populate((unsigned long)__identity_va(ident_map_size), untracked_end, + POPULATE_KASAN_ZERO_SHADOW); + kasan_populate(kernel_end, _REGION1_SIZE, POPULATE_KASAN_ZERO_SHADOW); } static bool kasan_pgd_populate_zero_shadow(pgd_t *pgd, unsigned long addr, @@ -180,7 +159,9 @@ static bool kasan_pte_populate_zero_shadow(pte_t *pte, enum populate_mode mode) } #else -static inline void kasan_populate_shadow(void) {} +static inline void kasan_populate_shadow(unsigned long kernel_start, unsigned long kernel_end) +{ +} static inline bool kasan_pgd_populate_zero_shadow(pgd_t *pgd, unsigned long addr, unsigned long end, enum populate_mode mode) @@ -263,6 +244,10 @@ static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_m return addr; case POPULATE_ABS_LOWCORE: return __abs_lowcore_pa(addr); + case POPULATE_KERNEL: + return __kernel_pa(addr); + case POPULATE_IDENTITY: + return __identity_pa(addr); #ifdef CONFIG_KASAN case POPULATE_KASAN_MAP_SHADOW: addr = physmem_alloc_top_down(RR_VMEM, size, size); @@ -274,15 +259,22 @@ static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_m } } -static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end) +static bool large_allowed(enum populate_mode mode) { - return machine.has_edat2 && + return (mode == POPULATE_DIRECT) || (mode == POPULATE_IDENTITY); +} + +static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end, + enum populate_mode mode) +{ + return machine.has_edat2 && large_allowed(mode) && IS_ALIGNED(addr, PUD_SIZE) && (end - addr) >= PUD_SIZE; } -static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end) +static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end, + enum populate_mode mode) { - return machine.has_edat1 && + return machine.has_edat1 && large_allowed(mode) && IS_ALIGNED(addr, PMD_SIZE) && (end - addr) >= PMD_SIZE; } @@ 
-322,7 +314,7 @@ static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long e if (pmd_none(*pmd)) { if (kasan_pmd_populate_zero_shadow(pmd, addr, next, mode)) continue; - if (can_large_pmd(pmd, addr, next)) { + if (can_large_pmd(pmd, addr, next, mode)) { entry = __pmd(_pa(addr, _SEGMENT_SIZE, mode)); entry = set_pmd_bit(entry, SEGMENT_KERNEL); if (!machine.has_nx) @@ -355,7 +347,7 @@ static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long e if (pud_none(*pud)) { if (kasan_pud_populate_zero_shadow(pud, addr, next, mode)) continue; - if (can_large_pud(pud, addr, next)) { + if (can_large_pud(pud, addr, next, mode)) { entry = __pud(_pa(addr, _REGION3_SIZE, mode)); entry = set_pud_bit(entry, REGION3_KERNEL); if (!machine.has_nx) @@ -418,11 +410,12 @@ static void pgtable_populate(unsigned long addr, unsigned long end, enum populat } } -void setup_vmem(unsigned long asce_limit) +void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned long asce_limit) { unsigned long start, end; unsigned long asce_type; unsigned long asce_bits; + pgd_t *init_mm_pgd; int i; /* @@ -433,6 +426,15 @@ void setup_vmem(unsigned long asce_limit) for_each_physmem_online_range(i, &start, &end) __arch_set_page_nodat((void *)start, (end - start) >> PAGE_SHIFT); + /* + * init_mm->pgd contains virtual address of swapper_pg_dir. + * It is unusable at this stage since DAT is yet off. Swap + * it for physical address of swapper_pg_dir and restore + * the virtual address after all page tables are created. + */ + init_mm_pgd = init_mm.pgd; + init_mm.pgd = (pgd_t *)swapper_pg_dir; + if (asce_limit == _REGION1_SIZE) { asce_type = _REGION2_ENTRY_EMPTY; asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; @@ -451,17 +453,25 @@ void setup_vmem(unsigned long asce_limit) * To allow prefixing the lowcore must be mapped with 4KB pages. * To prevent creation of a large page at address 0 first map * the lowcore and create the identity mapping only afterwards. + * + * Skip 0x100000 bytes for kernel pgtables, as per the linker script: + * . 
= 0x100000; */ pgtable_populate(0, sizeof(struct lowcore), POPULATE_DIRECT); - for_each_physmem_usable_range(i, &start, &end) - pgtable_populate(start, end, POPULATE_DIRECT); + for_each_physmem_usable_range(i, &start, &end) { + pgtable_populate((unsigned long)__identity_va(start), + (unsigned long)__identity_va(end), + POPULATE_IDENTITY); + } + pgtable_populate(kernel_start + 0x100000, kernel_end, POPULATE_KERNEL); + pgtable_populate(AMODE31_START, AMODE31_END, POPULATE_DIRECT); pgtable_populate(__abs_lowcore, __abs_lowcore + sizeof(struct lowcore), POPULATE_ABS_LOWCORE); pgtable_populate(__memcpy_real_area, __memcpy_real_area + PAGE_SIZE, POPULATE_NONE); - memcpy_real_ptep = __virt_to_kpte(__memcpy_real_area); + memcpy_real_ptep = __identity_va(__virt_to_kpte(__memcpy_real_area)); - kasan_populate_shadow(); + kasan_populate_shadow(kernel_start + 0x100000, kernel_end); S390_lowcore.kernel_asce.val = swapper_pg_dir | asce_bits; S390_lowcore.user_asce = s390_invalid_asce; @@ -471,4 +481,5 @@ void setup_vmem(unsigned long asce_limit) local_ctl_load(13, &S390_lowcore.kernel_asce); init_mm.context.asce = S390_lowcore.kernel_asce.val; + init_mm.pgd = init_mm_pgd; } diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 95403750d25a..b39d724ab6f6 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -208,16 +208,22 @@ static inline int kaslr_enabled(void) #define __PAGE_OFFSET __identity_base #define PAGE_OFFSET __PAGE_OFFSET -#define __pa_nodebug(x) ((unsigned long)(x)) - #ifdef __DECOMPRESSOR +#define __pa_nodebug(x) ((unsigned long)(x)) #define __pa(x) __pa_nodebug(x) #define __pa32(x) __pa(x) #define __va(x) ((void *)(unsigned long)(x)) #else /* __DECOMPRESSOR */ +static inline unsigned long __pa_nodebug(unsigned long x) +{ + if (x < __kaslr_offset) + return x - __identity_base; + return x - __kaslr_offset + __kaslr_offset_phys; +} + #ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x, bool is_31bit); @@ -233,7 +239,7 @@ static inline unsigned long __phys_addr(unsigned long x, bool is_31bit) #define __pa(x) __phys_addr((unsigned long)(x), false) #define __pa32(x) __phys_addr((unsigned long)(x), true) -#define __va(x) ((void *)(unsigned long)(x)) +#define __va(x) ((void *)((unsigned long)(x) + __identity_base)) #endif /* __DECOMPRESSOR */ @@ -258,7 +264,7 @@ static inline unsigned long virt_to_pfn(const void *kaddr) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) #define page_to_virt(page) pfn_to_virt(page_to_pfn(page)) -#define virt_addr_valid(kaddr) pfn_valid(phys_to_pfn(__pa_nodebug(kaddr))) +#define virt_addr_valid(kaddr) pfn_valid(phys_to_pfn(__pa_nodebug((unsigned long)(kaddr)))) #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 60950e7a25f5..0f6321ff427b 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -107,6 +107,12 @@ static inline int is_module_addr(void *addr) return 1; } +#ifdef CONFIG_RANDOMIZE_BASE +#define KASLR_LEN (1UL << 31) +#else +#define KASLR_LEN 0UL +#endif + /* * A 64 bit pagetable entry of S390 has following format: * | PFRA |0IPC| OS | diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 917b8a37383a..41c714e21292 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -22,6 +23,7 @@ #include #include #include +#include static DEFINE_MUTEX(vmem_mutex); From 
54f2ecc3188f78723267826f634e0747169f8685 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 26 Sep 2023 15:58:51 +0200 Subject: [PATCH 34/54] s390: Map kernel at fixed location when KASLR is disabled Since kernel virtual and physical address spaces are uncoupled the kernel is mapped at the top of the virtual address space in case KASLR is disabled. That does not pose any issue with regard to the kernel booting and operation, but makes it difficult to use a generated vmlinux with some debugging tools (e.g. gdb), because the exact location of the kernel image in virtual memory is unknown. Make that location known and introduce CONFIG_KERNEL_IMAGE_BASE configuration option. A custom CONFIG_KERNEL_IMAGE_BASE value that would break the virtual memory layout leads to a build error. The kernel image size is defined by KERNEL_IMAGE_SIZE macro and set to 512 MB, by analogy with x86. Suggested-by: Vasily Gorbik Signed-off-by: Alexander Gordeev --- .../admin-guide/kernel-parameters.txt | 4 ++- arch/s390/Kconfig | 19 +++++++++++ arch/s390/boot/startup.c | 33 ++++++++++++++----- arch/s390/include/asm/page.h | 4 +++ arch/s390/tools/relocs.c | 2 +- 5 files changed, 51 insertions(+), 11 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bb884c14b2f6..b74741e0a053 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4780,7 +4780,9 @@ prot_virt= [S390] enable hosting protected virtual machines isolated from the hypervisor (if hardware supports - that). + that). If enabled, the default kernel base address + might be overridden even when Kernel Address Space + Layout Randomization is disabled. Format: psi= [KNL] Enable or disable pressure stall information diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 9515d4bf1683..df6b371ed214 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -614,6 +614,25 @@ config RANDOMIZE_BASE as a security feature that deters exploit attempts relying on knowledge of the location of kernel internals. +config KERNEL_IMAGE_BASE + hex "Kernel image base address" + range 0x100000 0x1FFFFFE0000000 if !KASAN + range 0x100000 0x1BFFFFE0000000 if KASAN + default 0x3FFE0000000 if !KASAN + default 0x7FFFE0000000 if KASAN + help + This is the address at which the kernel image is loaded in case + Kernel Address Space Layout Randomization (KASLR) is disabled. + + In case the Protected virtualization guest support is enabled the + Ultravisor imposes a virtual address limit. If the value of this + option leads to the kernel image exceeding the Ultravisor limit, + this option is ignored and the image is loaded below the limit. + + If the value of this option leads to the kernel image overlapping + the virtual memory where other data structures are located, this + option is ignored and the image is loaded above the structures. 
+ endmenu menu "Memory setup" diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 3efa1f457451..fc9c1d51ef8d 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -281,8 +281,8 @@ static unsigned long get_vmem_size(unsigned long identity_size, static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) { - unsigned long kernel_start, kernel_end; unsigned long vmemmap_start; + unsigned long kernel_start; unsigned long asce_limit; unsigned long rte_size; unsigned long pages; @@ -294,11 +294,18 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page); /* choose kernel address space layout: 4 or 3 levels. */ + BUILD_BUG_ON(!IS_ALIGNED(__NO_KASLR_START_KERNEL, THREAD_SIZE)); + BUILD_BUG_ON(__NO_KASLR_END_KERNEL > _REGION1_SIZE); vsize = get_vmem_size(ident_map_size, vmemmap_size, vmalloc_size, _REGION3_SIZE); - if (IS_ENABLED(CONFIG_KASAN) || (vsize > _REGION2_SIZE)) { + if (IS_ENABLED(CONFIG_KASAN) || __NO_KASLR_END_KERNEL > _REGION2_SIZE || + (vsize > _REGION2_SIZE && kaslr_enabled())) { asce_limit = _REGION1_SIZE; - rte_size = _REGION2_SIZE; - vsize = get_vmem_size(ident_map_size, vmemmap_size, vmalloc_size, _REGION2_SIZE); + if (__NO_KASLR_END_KERNEL > _REGION2_SIZE) { + rte_size = _REGION2_SIZE; + vsize = get_vmem_size(ident_map_size, vmemmap_size, vmalloc_size, _REGION2_SIZE); + } else { + rte_size = _REGION3_SIZE; + } } else { asce_limit = _REGION2_SIZE; rte_size = _REGION3_SIZE; @@ -308,24 +315,32 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) * Forcing modules and vmalloc area under the ultravisor * secure storage limit, so that any vmalloc allocation * we do could be used to back secure guest storage. + * + * Assume the secure storage limit always exceeds _REGION2_SIZE, + * otherwise asce_limit and rte_size would have been adjusted. 
*/ vmax = adjust_to_uv_max(asce_limit); #ifdef CONFIG_KASAN + BUILD_BUG_ON(__NO_KASLR_END_KERNEL > KASAN_SHADOW_START); /* force vmalloc and modules below kasan shadow */ vmax = min(vmax, KASAN_SHADOW_START); #endif - kernel_end = vmax; + vsize = min(vsize, vmax); if (kaslr_enabled()) { - unsigned long kaslr_len, slots, pos; + unsigned long kernel_end, kaslr_len, slots, pos; - vsize = min(vsize, vmax); kaslr_len = max(KASLR_LEN, vmax - vsize); slots = DIV_ROUND_UP(kaslr_len - kernel_size, THREAD_SIZE); if (get_random(slots, &pos)) pos = 0; - kernel_end -= pos * THREAD_SIZE; + kernel_end = vmax - pos * THREAD_SIZE; + kernel_start = round_down(kernel_end - kernel_size, THREAD_SIZE); + } else if (vmax < __NO_KASLR_END_KERNEL || vsize > __NO_KASLR_END_KERNEL) { + kernel_start = round_down(vmax - kernel_size, THREAD_SIZE); + decompressor_printk("The kernel base address is forced to %lx\n", kernel_start); + } else { + kernel_start = __NO_KASLR_START_KERNEL; } - kernel_start = round_down(kernel_end - kernel_size, THREAD_SIZE); __kaslr_offset = kernel_start; MODULES_END = round_down(kernel_start, _SEGMENT_SIZE); diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index b39d724ab6f6..e1b6355ddbef 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -273,4 +273,8 @@ static inline unsigned long virt_to_pfn(const void *kaddr) #include #include +#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) +#define __NO_KASLR_START_KERNEL CONFIG_KERNEL_IMAGE_BASE +#define __NO_KASLR_END_KERNEL (__NO_KASLR_START_KERNEL + KERNEL_IMAGE_SIZE) + #endif /* _S390_PAGE_H */ diff --git a/arch/s390/tools/relocs.c b/arch/s390/tools/relocs.c index 30a732c808f3..b4f35506779b 100644 --- a/arch/s390/tools/relocs.c +++ b/arch/s390/tools/relocs.c @@ -280,7 +280,7 @@ static int do_reloc(struct section *sec, Elf_Rel *rel) case R_390_GOTOFF64: break; case R_390_64: - add_reloc(&relocs64, offset); + add_reloc(&relocs64, offset - (ehdr.e_entry - 0x100000)); break; default: die("Unsupported relocation type: %d\n", r_type); From 56b1069c40c777e9cba595a62857293628067d65 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 22 Mar 2024 14:39:57 +0100 Subject: [PATCH 35/54] s390/boot: Rework deployment of the kernel image Rework deployment of kernel image for both compressed and uncompressed variants as defined by CONFIG_KERNEL_UNCOMPRESSED kernel configuration variable. In case CONFIG_KERNEL_UNCOMPRESSED is disabled avoid uncompressing the kernel to a temporary buffer and copying it to the target address. Instead, uncompress it directly to the target destination. In case CONFIG_KERNEL_UNCOMPRESSED is enabled avoid moving the kernel to default 0x100000 location when KASLR is disabled or failed. Instead, use the uncompressed kernel image directly. In case KASLR is disabled or failed .amode31 section location in memory is not randomized and precedes the kernel image. In case CONFIG_KERNEL_UNCOMPRESSED is disabled that location overlaps the area used by the decompression algorithm. That is fine, since that area is not used after the decompression finished and the size of .amode31 section is not expected to exceed BOOT_HEAP_SIZE ever. There is no decompression in case CONFIG_KERNEL_UNCOMPRESSED is enabled. Therefore, rename decompress_kernel() to deploy_kernel(), which better describes both uncompressed and compressed cases. Introduce AMODE31_SIZE macro to avoid immediate value of 0x3000 (the size of .amode31 section) in the decompressor linker script. 
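As a rough illustration of the two deployment paths (condensed from the decompressor.c and startup.c hunks below, not a literal quote of the patch):

	/* CONFIG_KERNEL_UNCOMPRESSED disabled: uncompress straight to the target. */
	void deploy_kernel(void *output)
	{
		__decompress(_compressed_start, _compressed_end - _compressed_start,
			     NULL, NULL, output, vmlinux.image_size, NULL, error);
	}

	/* CONFIG_KERNEL_UNCOMPRESSED enabled: move the image only if the target differs. */
	static void deploy_kernel(void *output)
	{
		void *uncompressed_start = (void *)_compressed_start;

		if (output == uncompressed_start)
			return;
		memmove(output, uncompressed_start, vmlinux.image_size);
		memset(uncompressed_start, 0, vmlinux.image_size);
	}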
Modify the vmlinux linker script to force the size of .amode31 section to AMODE31_SIZE (the value of (_eamode31 - _samode31) could otherwise differ as a result of the compiler options used). Introduce __START_KERNEL macro that defines the kernel ELF image entry point and set it to the current value of 0x100000. Signed-off-by: Alexander Gordeev --- arch/s390/boot/boot.h | 1 - arch/s390/boot/decompressor.c | 15 +----- arch/s390/boot/decompressor.h | 8 ++- arch/s390/boot/startup.c | 90 ++++++++++++++++++---------------- arch/s390/boot/vmem.c | 7 +-- arch/s390/boot/vmlinux.lds.S | 3 +- arch/s390/include/asm/page.h | 3 ++ arch/s390/kernel/vmlinux.lds.S | 5 +- arch/s390/tools/relocs.c | 2 +- 9 files changed, 62 insertions(+), 72 deletions(-) diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index 412cdd490403..85da1c6cef4f 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -17,7 +17,6 @@ struct machine_info { }; struct vmlinux_info { - unsigned long default_lma; unsigned long entry; unsigned long image_size; /* does not include .bss */ unsigned long bss_size; /* uncompressed image .bss size */ diff --git a/arch/s390/boot/decompressor.c b/arch/s390/boot/decompressor.c index d762733a0753..f478e8e9cbda 100644 --- a/arch/s390/boot/decompressor.c +++ b/arch/s390/boot/decompressor.c @@ -63,24 +63,13 @@ static unsigned long free_mem_end_ptr = (unsigned long) _end + BOOT_HEAP_SIZE; #include "../../../../lib/decompress_unzstd.c" #endif -#define decompress_offset ALIGN((unsigned long)_end + BOOT_HEAP_SIZE, PAGE_SIZE) - unsigned long mem_safe_offset(void) { - /* - * due to 4MB HEAD_SIZE for bzip2 - * 'decompress_offset + vmlinux.image_size' could be larger than - * kernel at final position + its .bss, so take the larger of two - */ - return max(decompress_offset + vmlinux.image_size, - vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size); + return ALIGN(free_mem_end_ptr, PAGE_SIZE); } -void *decompress_kernel(void) +void deploy_kernel(void *output) { - void *output = (void *)decompress_offset; - __decompress(_compressed_start, _compressed_end - _compressed_start, NULL, NULL, output, vmlinux.image_size, NULL, error); - return output; } diff --git a/arch/s390/boot/decompressor.h b/arch/s390/boot/decompressor.h index 92b81d2ea35d..4f966f06bd65 100644 --- a/arch/s390/boot/decompressor.h +++ b/arch/s390/boot/decompressor.h @@ -2,11 +2,9 @@ #ifndef BOOT_COMPRESSED_DECOMPRESSOR_H #define BOOT_COMPRESSED_DECOMPRESSOR_H -#ifdef CONFIG_KERNEL_UNCOMPRESSED -static inline void *decompress_kernel(void) { return NULL; } -#else -void *decompress_kernel(void); -#endif +#ifndef CONFIG_KERNEL_UNCOMPRESSED unsigned long mem_safe_offset(void); +void deploy_kernel(void *output); +#endif #endif /* BOOT_COMPRESSED_DECOMPRESSOR_H */ diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index fc9c1d51ef8d..949eff7107cc 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -109,9 +109,19 @@ static void setup_lpp(void) } #ifdef CONFIG_KERNEL_UNCOMPRESSED -unsigned long mem_safe_offset(void) +static unsigned long mem_safe_offset(void) { - return vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size; + return (unsigned long)_compressed_start; +} + +static void deploy_kernel(void *output) +{ + void *uncompressed_start = (void *)_compressed_start; + + if (output == uncompressed_start) + return; + memmove(output, uncompressed_start, vmlinux.image_size); + memset(uncompressed_start, 0, vmlinux.image_size); } #endif @@ -154,18 +164,18 @@ static void kaslr_adjust_relocs(unsigned
long min_addr, unsigned long max_addr, rela_end = (Elf64_Rela *) vmlinux.rela_dyn_end; dynsym = (Elf64_Sym *) vmlinux.dynsym_start; for (rela = rela_start; rela < rela_end; rela++) { - loc = rela->r_offset + phys_offset; + loc = rela->r_offset + phys_offset - __START_KERNEL; val = rela->r_addend; r_sym = ELF64_R_SYM(rela->r_info); if (r_sym) { if (dynsym[r_sym].st_shndx != SHN_UNDEF) - val += dynsym[r_sym].st_value + offset; + val += dynsym[r_sym].st_value + offset - __START_KERNEL; } else { /* - * 0 == undefined symbol table index (STN_UNDEF), + * 0 == undefined symbol table index (SHN_UNDEF), * used for R_390_RELATIVE, only add KASLR offset */ - val += offset; + val += offset - __START_KERNEL; } r_type = ELF64_R_TYPE(rela->r_info); rc = arch_kexec_do_relocs(r_type, (void *) loc, val, 0); @@ -206,7 +216,7 @@ static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, loc = (long)*reloc + phys_offset; if (loc < min_addr || loc > max_addr) error("64-bit relocation outside of kernel!\n"); - *(u64 *)loc += offset; + *(u64 *)loc += offset - __START_KERNEL; } } @@ -219,7 +229,7 @@ static void kaslr_adjust_got(unsigned long offset) * reason. Adjust the GOT entries. */ for (entry = (u64 *)vmlinux.got_start; entry < (u64 *)vmlinux.got_end; entry++) - *entry += offset; + *entry += offset - __START_KERNEL; } #endif @@ -294,6 +304,7 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page); /* choose kernel address space layout: 4 or 3 levels. */ + BUILD_BUG_ON(!IS_ALIGNED(__START_KERNEL, THREAD_SIZE)); BUILD_BUG_ON(!IS_ALIGNED(__NO_KASLR_START_KERNEL, THREAD_SIZE)); BUILD_BUG_ON(__NO_KASLR_END_KERNEL > _REGION1_SIZE); vsize = get_vmem_size(ident_map_size, vmemmap_size, vmalloc_size, _REGION3_SIZE); @@ -383,9 +394,9 @@ static unsigned long setup_kernel_memory_layout(unsigned long kernel_size) /* * This function clears the BSS section of the decompressed Linux kernel and NOT the decompressor's. */ -static void clear_bss_section(unsigned long vmlinux_lma) +static void clear_bss_section(unsigned long kernel_start) { - memset((void *)vmlinux_lma + vmlinux.image_size, 0, vmlinux.bss_size); + memset((void *)kernel_start + vmlinux.image_size, 0, vmlinux.bss_size); } /* @@ -402,7 +413,7 @@ static void setup_vmalloc_size(void) vmalloc_size = max(size, vmalloc_size); } -static void kaslr_adjust_vmlinux_info(unsigned long offset) +static void kaslr_adjust_vmlinux_info(long offset) { vmlinux.bootdata_off += offset; vmlinux.bootdata_preserved_off += offset; @@ -426,24 +437,30 @@ static void kaslr_adjust_vmlinux_info(unsigned long offset) #endif } +static void fixup_vmlinux_info(void) +{ + vmlinux.entry -= __START_KERNEL; + kaslr_adjust_vmlinux_info(-__START_KERNEL); +} + void startup_kernel(void) { - unsigned long max_physmem_end; - unsigned long vmlinux_lma = 0; + unsigned long kernel_size = vmlinux.image_size + vmlinux.bss_size; + unsigned long nokaslr_offset_phys = mem_safe_offset(); unsigned long amode31_lma = 0; - unsigned long kernel_size; + unsigned long max_physmem_end; unsigned long asce_limit; unsigned long safe_addr; - void *img; psw_t psw; + fixup_vmlinux_info(); setup_lpp(); - safe_addr = mem_safe_offset(); + safe_addr = PAGE_ALIGN(nokaslr_offset_phys + kernel_size); /* - * Reserve decompressor memory together with decompression heap, buffer and - * memory which might be occupied by uncompressed kernel at default 1Mb - * position (if KASLR is off or failed). 
+ * Reserve decompressor memory together with decompression heap, + * buffer and memory which might be occupied by uncompressed kernel + * (if KASLR is off or failed). */ physmem_reserve(RR_DECOMPRESSOR, 0, safe_addr); if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && parmarea.initrd_size) @@ -463,7 +480,6 @@ void startup_kernel(void) max_physmem_end = detect_max_physmem_end(); setup_ident_map_size(max_physmem_end); setup_vmalloc_size(); - kernel_size = vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size; asce_limit = setup_kernel_memory_layout(kernel_size); /* got final ident_map_size, physmem allocations could be performed now */ physmem_set_usable_limit(ident_map_size); @@ -472,32 +488,20 @@ void startup_kernel(void) rescue_initrd(safe_addr, ident_map_size); rescue_relocs(); - if (kaslr_enabled()) { - vmlinux_lma = randomize_within_range(vmlinux.image_size + vmlinux.bss_size, - THREAD_SIZE, vmlinux.default_lma, - ident_map_size); - if (vmlinux_lma) { - __kaslr_offset_phys = vmlinux_lma - vmlinux.default_lma; - kaslr_adjust_vmlinux_info(__kaslr_offset_phys); - } - } - vmlinux_lma = vmlinux_lma ?: vmlinux.default_lma; - physmem_reserve(RR_VMLINUX, vmlinux_lma, vmlinux.image_size + vmlinux.bss_size); - - if (!IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) { - img = decompress_kernel(); - memmove((void *)vmlinux_lma, img, vmlinux.image_size); - } else if (__kaslr_offset_phys) { - img = (void *)vmlinux.default_lma; - memmove((void *)vmlinux_lma, img, vmlinux.image_size); - memset(img, 0, vmlinux.image_size); - } + if (kaslr_enabled()) + __kaslr_offset_phys = randomize_within_range(kernel_size, THREAD_SIZE, 0, ident_map_size); + if (!__kaslr_offset_phys) + __kaslr_offset_phys = nokaslr_offset_phys; + kaslr_adjust_vmlinux_info(__kaslr_offset_phys); + physmem_reserve(RR_VMLINUX, __kaslr_offset_phys, kernel_size); + deploy_kernel((void *)__kaslr_offset_phys); /* vmlinux decompression is done, shrink reserved low memory */ physmem_reserve(RR_DECOMPRESSOR, 0, (unsigned long)_decompressor_end); if (kaslr_enabled()) amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE, 0, SZ_2G); - amode31_lma = amode31_lma ?: vmlinux.default_lma - vmlinux.amode31_size; + if (!amode31_lma) + amode31_lma = __kaslr_offset_phys - vmlinux.amode31_size; physmem_reserve(RR_AMODE31, amode31_lma, vmlinux.amode31_size); /* @@ -513,8 +517,8 @@ void startup_kernel(void) * - copy_bootdata() must follow setup_vmem() to propagate changes * to bootdata made by setup_vmem() */ - clear_bss_section(vmlinux_lma); - kaslr_adjust_relocs(vmlinux_lma, vmlinux_lma + vmlinux.image_size, + clear_bss_section(__kaslr_offset_phys); + kaslr_adjust_relocs(__kaslr_offset_phys, __kaslr_offset_phys + vmlinux.image_size, __kaslr_offset, __kaslr_offset_phys); kaslr_adjust_got(__kaslr_offset); free_relocs(); diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index 86edd432e593..96d48b7112d4 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -453,9 +453,6 @@ void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned l * To allow prefixing the lowcore must be mapped with 4KB pages. * To prevent creation of a large page at address 0 first map * the lowcore and create the identity mapping only afterwards. - * - * Skip 0x100000 bytes for kernel pgtables, as per the linker script: - * . 
= 0x100000; */ pgtable_populate(0, sizeof(struct lowcore), POPULATE_DIRECT); for_each_physmem_usable_range(i, &start, &end) { @@ -463,7 +460,7 @@ void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned l (unsigned long)__identity_va(end), POPULATE_IDENTITY); } - pgtable_populate(kernel_start + 0x100000, kernel_end, POPULATE_KERNEL); + pgtable_populate(kernel_start, kernel_end, POPULATE_KERNEL); pgtable_populate(AMODE31_START, AMODE31_END, POPULATE_DIRECT); pgtable_populate(__abs_lowcore, __abs_lowcore + sizeof(struct lowcore), POPULATE_ABS_LOWCORE); @@ -471,7 +468,7 @@ void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned l POPULATE_NONE); memcpy_real_ptep = __identity_va(__virt_to_kpte(__memcpy_real_area)); - kasan_populate_shadow(kernel_start + 0x100000, kernel_end); + kasan_populate_shadow(kernel_start, kernel_end); S390_lowcore.kernel_asce.val = swapper_pg_dir | asce_bits; S390_lowcore.user_asce = s390_invalid_asce; diff --git a/arch/s390/boot/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S index 3d7ea585ab99..9d6d26908355 100644 --- a/arch/s390/boot/vmlinux.lds.S +++ b/arch/s390/boot/vmlinux.lds.S @@ -100,7 +100,8 @@ SECTIONS _decompressor_end = .; #ifdef CONFIG_KERNEL_UNCOMPRESSED - . = 0x100000; + . = ALIGN(PAGE_SIZE); + . += AMODE31_SIZE; /* .amode31 section */ #else . = ALIGN(8); #endif diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index e1b6355ddbef..224ff9d433ea 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -273,7 +273,10 @@ static inline unsigned long virt_to_pfn(const void *kaddr) #include #include +#define AMODE31_SIZE (3 * PAGE_SIZE) + #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) +#define __START_KERNEL 0x100000 #define __NO_KASLR_START_KERNEL CONFIG_KERNEL_IMAGE_BASE #define __NO_KASLR_END_KERNEL (__NO_KASLR_START_KERNEL + KERNEL_IMAGE_SIZE) diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 48de296e8905..023fade1da3b 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -39,7 +39,7 @@ PHDRS { SECTIONS { - . = 0x100000; + . = __START_KERNEL; .text : { _stext = .; /* Start of text section */ _text = .; /* Text and read-only data */ @@ -183,7 +183,7 @@ SECTIONS .amode31.data : { *(.amode31.data) } - . = ALIGN(PAGE_SIZE); + . = _samode31 + AMODE31_SIZE; _eamode31 = .; /* early.c uses stsi, which requires page aligned data. */ @@ -230,7 +230,6 @@ SECTIONS * it should match struct vmlinux_info */ .vmlinux.info 0 (INFO) : { - QUAD(_stext) /* default_lma */ QUAD(startup_continue) /* entry */ QUAD(__bss_start - _stext) /* image_size */ QUAD(__bss_stop - __bss_start) /* bss_size */ diff --git a/arch/s390/tools/relocs.c b/arch/s390/tools/relocs.c index b4f35506779b..a74dbd5c9896 100644 --- a/arch/s390/tools/relocs.c +++ b/arch/s390/tools/relocs.c @@ -280,7 +280,7 @@ static int do_reloc(struct section *sec, Elf_Rel *rel) case R_390_GOTOFF64: break; case R_390_64: - add_reloc(&relocs64, offset - (ehdr.e_entry - 0x100000)); + add_reloc(&relocs64, offset - ehdr.e_entry); break; default: die("Unsupported relocation type: %d\n", r_type); From 236d70f82bec6e1f3ab50a5242fc1c7f779e941e Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Mon, 26 Feb 2024 11:47:40 +0100 Subject: [PATCH 36/54] s390/boot: Do not rescue .vmlinux.relocs section The .vmlinux.relocs section is moved in front of the compressed kernel. The interim section rescue step is avoided as a result.
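As a sketch of the effect (condensed from the startup.c hunk below), the relocation walk now reads the R_390_64 offset table directly from its linked location, with no rescue buffer in between:

	/* Sketch: iterate the in-image relocation offset table in place. */
	for (reloc = (int *)__vmlinux_relocs_64_start;
	     reloc < (int *)__vmlinux_relocs_64_end; reloc++) {
		loc = (long)*reloc + phys_offset;
		if (loc < min_addr || loc > max_addr)
			error("64-bit relocation outside of kernel!\n");
		*(u64 *)loc += offset - __START_KERNEL;
	}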
Suggested-by: Sumanth Korikkar Signed-off-by: Alexander Gordeev --- arch/s390/boot/startup.c | 35 +++++++++++----------------- arch/s390/boot/vmlinux.lds.S | 27 +++++++-------------- arch/s390/include/asm/physmem_info.h | 1 - 3 files changed, 22 insertions(+), 41 deletions(-) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 949eff7107cc..246d54499c20 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -185,26 +185,7 @@ static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, } static void kaslr_adjust_got(unsigned long offset) {} -static void rescue_relocs(void) {} -static void free_relocs(void) {} #else -static int *vmlinux_relocs_64_start; -static int *vmlinux_relocs_64_end; - -static void rescue_relocs(void) -{ - unsigned long size = __vmlinux_relocs_64_end - __vmlinux_relocs_64_start; - - vmlinux_relocs_64_start = (void *)physmem_alloc_top_down(RR_RELOC, size, 0); - vmlinux_relocs_64_end = (void *)vmlinux_relocs_64_start + size; - memmove(vmlinux_relocs_64_start, __vmlinux_relocs_64_start, size); -} - -static void free_relocs(void) -{ - physmem_free(RR_RELOC); -} - static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, unsigned long offset, unsigned long phys_offset) { @@ -212,7 +193,7 @@ static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, long loc; /* Adjust R_390_64 relocations */ - for (reloc = vmlinux_relocs_64_start; reloc < vmlinux_relocs_64_end; reloc++) { + for (reloc = (int *)__vmlinux_relocs_64_start; reloc < (int *)__vmlinux_relocs_64_end; reloc++) { loc = (long)*reloc + phys_offset; if (loc < min_addr || loc > max_addr) error("64-bit relocation outside of kernel!\n"); @@ -486,7 +467,6 @@ void startup_kernel(void) detect_physmem_online_ranges(max_physmem_end); save_ipl_cert_comp_list(); rescue_initrd(safe_addr, ident_map_size); - rescue_relocs(); if (kaslr_enabled()) __kaslr_offset_phys = randomize_within_range(kernel_size, THREAD_SIZE, 0, ident_map_size); @@ -498,6 +478,18 @@ void startup_kernel(void) /* vmlinux decompression is done, shrink reserved low memory */ physmem_reserve(RR_DECOMPRESSOR, 0, (unsigned long)_decompressor_end); + + /* + * In case KASLR is enabled the randomized location of .amode31 + * section might overlap with .vmlinux.relocs section. To avoid that + * the below randomize_within_range() could have been called with + * __vmlinux_relocs_64_end as the lower range address. However, + * .amode31 section is written to by the decompressed kernel - at + * that time the contents of .vmlinux.relocs is not needed anymore. + * Conversely, .vmlinux.relocs is read only by the decompressor, even + * before the kernel started. Therefore, in case the two sections + * overlap there is no risk of corrupting any data. + */ if (kaslr_enabled()) amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE, 0, SZ_2G); if (!amode31_lma) @@ -521,7 +513,6 @@ void startup_kernel(void) kaslr_adjust_relocs(__kaslr_offset_phys, __kaslr_offset_phys + vmlinux.image_size, __kaslr_offset, __kaslr_offset_phys); kaslr_adjust_got(__kaslr_offset); - free_relocs(); setup_vmem(__kaslr_offset, __kaslr_offset + kernel_size, asce_limit); copy_bootdata(); diff --git a/arch/s390/boot/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S index 9d6d26908355..d6454ec01e22 100644 --- a/arch/s390/boot/vmlinux.lds.S +++ b/arch/s390/boot/vmlinux.lds.S @@ -99,6 +99,15 @@ SECTIONS _decompressor_end = .; +#ifndef CONFIG_PIE_BUILD .
= ALIGN(4); + .vmlinux.relocs : { + __vmlinux_relocs_64_start = .; + *(.vmlinux.relocs_64) + __vmlinux_relocs_64_end = .; + } +#endif + #ifdef CONFIG_KERNEL_UNCOMPRESSED . = ALIGN(PAGE_SIZE); . += AMODE31_SIZE; /* .amode31 section */ @@ -111,24 +120,6 @@ SECTIONS _compressed_end = .; } -#ifndef CONFIG_PIE_BUILD - /* - * When the kernel is built with CONFIG_KERNEL_UNCOMPRESSED, the entire - * uncompressed vmlinux.bin is positioned in the bzImage decompressor - * image at the default kernel LMA of 0x100000, enabling it to be - * executed in-place. However, the size of .vmlinux.relocs could be - * large enough to cause an overlap with the uncompressed kernel at the - * address 0x100000. To address this issue, .vmlinux.relocs is - * positioned after the .rodata.compressed. - */ - . = ALIGN(4); - .vmlinux.relocs : { - __vmlinux_relocs_64_start = .; - *(.vmlinux.relocs_64) - __vmlinux_relocs_64_end = .; - } -#endif - #define SB_TRAILER_SIZE 32 /* Trailer needed for Secure Boot */ . += SB_TRAILER_SIZE; /* make sure .sb.trailer does not overwrite the previous section */ diff --git a/arch/s390/include/asm/physmem_info.h b/arch/s390/include/asm/physmem_info.h index d2f1cc742695..f45cfc8bc233 100644 --- a/arch/s390/include/asm/physmem_info.h +++ b/arch/s390/include/asm/physmem_info.h @@ -22,7 +22,6 @@ enum reserved_range_type { RR_DECOMPRESSOR, RR_INITRD, RR_VMLINUX, - RR_RELOC, RR_AMODE31, RR_IPLREPORT, RR_CERT_COMP_LIST, From ea84f14d2a6b1a4fde17d2713dbdfdef7b84da87 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Wed, 17 Jan 2024 11:50:46 +0100 Subject: [PATCH 37/54] s390/nospec: Correct modules thunk offset calculation Fix offset calculation when branch target is more than 2Gb away. Signed-off-by: Vasily Gorbik Signed-off-by: Alexander Gordeev --- arch/s390/kernel/nospec-branch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c index d1b16d83e49a..9b8c24ebb008 100644 --- a/arch/s390/kernel/nospec-branch.c +++ b/arch/s390/kernel/nospec-branch.c @@ -114,10 +114,10 @@ static void __init_or_module __nospec_revert(s32 *start, s32 *end) type = BRASL_EXPOLINE; /* brasl instruction */ else continue; - thunk = instr + (*(int *)(instr + 2)) * 2; + thunk = instr + (long)(*(int *)(instr + 2)) * 2; if (thunk[0] == 0xc6 && thunk[1] == 0x00) /* exrl %r0, */ - br = thunk + (*(int *)(thunk + 2)) * 2; + br = thunk + (long)(*(int *)(thunk + 2)) * 2; else continue; if (br[0] != 0x07 || (br[1] & 0xf0) != 0xf0) From ba05b39d54eef78043b5c8ee90545cb06a98ae6f Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Wed, 17 Jan 2024 11:50:49 +0100 Subject: [PATCH 38/54] s390/expoline: Make modules use kernel expolines Currently, kernel modules contain their own set of expoline thunks. In the case of EXPOLINE_EXTERN, this involves postlinking of precompiled expoline.o. expoline.o is also necessary for out-of-source-tree module builds. Now that the kernel modules area is less than 4 GB away from kernel expoline thunks, make modules use kernel expolines. Also make EXPOLINE_EXTERN the default if the compiler supports it. This simplifies the build and aligns with the approach adopted by other architectures.
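Conceptually, an indirect branch in module code now resolves against the thunks exported by vmlinux instead of a module-local copy. A rough illustration (the register choice and emitted instruction sequence are illustrative assumptions, not quoted from the patch):

	/* C source in a module: */
	ret = ops->fn(arg);
	/*
	 * With -mindirect-branch=thunk-extern the compiler emits a call to a
	 * named external thunk instead of branching through the register
	 * directly, roughly:
	 *
	 *	lgr	%r1,<address of ops->fn>
	 *	brasl	%r14,__s390_indirect_jump_r1
	 *
	 * That undefined symbol is now satisfied at module load time by the
	 * expoline exported from the kernel (see the EXPORT_SYMBOL added in
	 * the nospec-insn.h hunk below).
	 */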
Signed-off-by: Vasily Gorbik Signed-off-by: Alexander Gordeev --- arch/s390/Kconfig | 2 +- arch/s390/Makefile | 6 ------ arch/s390/include/asm/asm-prototypes.h | 1 + arch/s390/include/asm/nospec-branch.h | 20 ++++++++++++++++++++ arch/s390/include/asm/nospec-insn.h | 13 +++++++------ arch/s390/lib/Makefile | 2 +- arch/s390/lib/{expoline => }/expoline.S | 0 arch/s390/lib/expoline/Makefile | 3 --- scripts/mod/modpost.c | 5 ----- 9 files changed, 30 insertions(+), 22 deletions(-) rename arch/s390/lib/{expoline => }/expoline.S (100%) delete mode 100644 arch/s390/lib/expoline/Makefile diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index df6b371ed214..a077ded1b9e6 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -555,7 +555,7 @@ config EXPOLINE If unsure, say N. config EXPOLINE_EXTERN - def_bool n + def_bool y if EXPOLINE depends on EXPOLINE depends on CC_IS_GCC && GCC_VERSION >= 110200 depends on $(success,$(srctree)/arch/s390/tools/gcc-thunk-extern.sh $(CC)) diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 2dbb2d2f22f9..64821f54f1e0 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -88,7 +88,6 @@ endif ifdef CONFIG_EXPOLINE ifdef CONFIG_EXPOLINE_EXTERN - KBUILD_LDFLAGS_MODULE += arch/s390/lib/expoline/expoline.o CC_FLAGS_EXPOLINE := -mindirect-branch=thunk-extern CC_FLAGS_EXPOLINE += -mfunction-return=thunk-extern else @@ -167,11 +166,6 @@ vdso_prepare: prepare0 vdso-install-y += arch/s390/kernel/vdso64/vdso64.so.dbg vdso-install-$(CONFIG_COMPAT) += arch/s390/kernel/vdso32/vdso32.so.dbg -ifdef CONFIG_EXPOLINE_EXTERN -modules_prepare: expoline_prepare -expoline_prepare: scripts - $(Q)$(MAKE) $(build)=arch/s390/lib/expoline arch/s390/lib/expoline/expoline.o -endif endif # Don't use tabs in echo arguments diff --git a/arch/s390/include/asm/asm-prototypes.h b/arch/s390/include/asm/asm-prototypes.h index 56096ae26f29..f662eb4b9246 100644 --- a/arch/s390/include/asm/asm-prototypes.h +++ b/arch/s390/include/asm/asm-prototypes.h @@ -4,6 +4,7 @@ #include #include #include +#include #include __int128_t __ashlti3(__int128_t a, int b); diff --git a/arch/s390/include/asm/nospec-branch.h b/arch/s390/include/asm/nospec-branch.h index 82725cf783c7..b9c1f3cae842 100644 --- a/arch/s390/include/asm/nospec-branch.h +++ b/arch/s390/include/asm/nospec-branch.h @@ -17,6 +17,26 @@ static inline bool nospec_uses_trampoline(void) return __is_defined(CC_USING_EXPOLINE) && !nospec_disable; } +#ifdef CONFIG_EXPOLINE_EXTERN + +void __s390_indirect_jump_r1(void); +void __s390_indirect_jump_r2(void); +void __s390_indirect_jump_r3(void); +void __s390_indirect_jump_r4(void); +void __s390_indirect_jump_r5(void); +void __s390_indirect_jump_r6(void); +void __s390_indirect_jump_r7(void); +void __s390_indirect_jump_r8(void); +void __s390_indirect_jump_r9(void); +void __s390_indirect_jump_r10(void); +void __s390_indirect_jump_r11(void); +void __s390_indirect_jump_r12(void); +void __s390_indirect_jump_r13(void); +void __s390_indirect_jump_r14(void); +void __s390_indirect_jump_r15(void); + +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_EXPOLINE_H */ diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h index 7a946c42ad13..cb15dd25bf21 100644 --- a/arch/s390/include/asm/nospec-insn.h +++ b/arch/s390/include/asm/nospec-insn.h @@ -16,24 +16,25 @@ */ .macro __THUNK_PROLOG_NAME name #ifdef CONFIG_EXPOLINE_EXTERN - .pushsection .text,"ax",@progbits - __ALIGN + SYM_CODE_START(\name) #else .pushsection .text.\name,"axG",@progbits,\name,comdat -#endif .globl \name 
.hidden \name .type \name,@function \name: CFI_STARTPROC +#endif .endm .macro __THUNK_EPILOG_NAME name - CFI_ENDPROC #ifdef CONFIG_EXPOLINE_EXTERN - .size \name, .-\name -#endif + SYM_CODE_END(\name) + EXPORT_SYMBOL(\name) +#else + CFI_ENDPROC .popsection +#endif .endm .macro __THUNK_PROLOG_BR r1 diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile index 90eac15ea62a..f43f897d3fc0 100644 --- a/arch/s390/lib/Makefile +++ b/arch/s390/lib/Makefile @@ -23,4 +23,4 @@ obj-$(CONFIG_S390_MODULES_SANITY_TEST_HELPERS) += test_modules_helpers.o lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o -obj-$(CONFIG_EXPOLINE_EXTERN) += expoline/ +obj-$(CONFIG_EXPOLINE_EXTERN) += expoline.o diff --git a/arch/s390/lib/expoline/expoline.S b/arch/s390/lib/expoline.S similarity index 100% rename from arch/s390/lib/expoline/expoline.S rename to arch/s390/lib/expoline.S diff --git a/arch/s390/lib/expoline/Makefile b/arch/s390/lib/expoline/Makefile deleted file mode 100644 index 854631d9cb03..000000000000 --- a/arch/s390/lib/expoline/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 - -obj-y += expoline.o diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 2f5b91da5afa..937294ff164f 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -601,11 +601,6 @@ static int ignore_undef_symbol(struct elf_info *info, const char *symname) strstarts(symname, "_savevr_") || strcmp(symname, ".TOC.") == 0) return 1; - - if (info->hdr->e_machine == EM_S390) - /* Expoline thunks are linked on all kernel modules during final link of .ko */ - if (strstarts(symname, "__s390_indirect_jump_r")) - return 1; /* Do not ignore this symbol */ return 0; } From 90a7592da14951bd21f74a53246ba30955a648aa Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 11 Apr 2024 18:14:40 +0200 Subject: [PATCH 39/54] mm/userfaultfd: Do not place zeropages when zeropages are disallowed s390x must disable shared zeropages for processes running VMs, because the VMs could end up making use of "storage keys" or protected virtualization, which are incompatible with shared zeropages. Yet, with userfaultfd it is possible to insert shared zeropages into such processes. Let's fall back to simply allocating a fresh zeroed anonymous folio and inserting that instead. mm_forbids_zeropage() was introduced in commit 593befa6ab74 ("mm: introduce mm_forbids_zeropage function"), briefly before userfaultfd went upstream. Note that we don't want to fail the UFFDIO_ZEROPAGE request like we do for hugetlb; it would be rather unexpected. Further, we also cannot really indicate "not supported" to user space ahead of time: it could be that the MM disallows zeropages after userfaultfd was already registered.
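The fallback described above is keyed off mm_forbids_zeropage(); condensed from the mm/userfaultfd.c hunk below:

	static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
					     struct vm_area_struct *dst_vma,
					     unsigned long dst_addr)
	{
		/* MM disallows the shared zeropage: insert a fresh zeroed anon folio. */
		if (mm_forbids_zeropage(dst_vma->vm_mm))
			return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);
		/* ... otherwise map the shared zeropage as before ... */
	}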
[ agordeev: Fixed checkpatch complaints ] Fixes: c1a4de99fada ("userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation") Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240411161441.910170-2-david@redhat.com Signed-off-by: David Hildenbrand Signed-off-by: Alexander Gordeev --- mm/userfaultfd.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 3c3539c573e7..829f7b1089fc 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -316,6 +316,38 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd, goto out; } +static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr) +{ + struct folio *folio; + int ret = -ENOMEM; + + folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr); + if (!folio) + return ret; + + if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) + goto out_put; + + /* + * The memory barrier inside __folio_mark_uptodate makes sure that + * zeroing out the folio becomes visible before mapping the page + * using set_pte_at(). See do_anonymous_page(). + */ + __folio_mark_uptodate(folio); + + ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, + &folio->page, true, 0); + if (ret) + goto out_put; + + return 0; +out_put: + folio_put(folio); + return ret; +} + static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr) @@ -324,6 +356,9 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, spinlock_t *ptl; int ret; + if (mm_forbids_zeropage(dst_vma->vm_mm)) + return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr); + _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), dst_vma->vm_page_prot)); ret = -EAGAIN; From 06201e00ee3e4beacac48aab2b83eff64ebf0bc0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 11 Apr 2024 18:14:41 +0200 Subject: [PATCH 40/54] s390/mm: Re-enable the shared zeropage for !PV and !skeys KVM guests commit fa41ba0d08de ("s390/mm: avoid empty zero pages for KVM guests to avoid postcopy hangs") introduced an undesired side effect when combined with memory ballooning and VM migration: memory part of the inflated memory balloon will consume memory. Assume we have a 100GiB VM with the balloon inflated to 40GiB. Our VM will consume ~60GiB of memory. If we now trigger a VM migration, hypervisors like QEMU will read all VM memory. As s390x does not support the shared zeropage, we'll end up allocating for all previously-inflated memory part of the memory balloon: 40 GiB. So we might easily (unexpectedly) crash the VM on the migration source. Even worse, hypervisors like QEMU optimize for zeropage migration to not consume memory on the migration destination: when migrating a "page full of zeroes", on the migration destination they check whether the target memory is already zero (by reading the destination memory) and avoid writing to the memory to not allocate memory: however, s390x will also allocate memory here, implying that also on the migration destination, we will end up allocating all previously-inflated memory part of the memory balloon. This is especially bad if actual memory overcommit was not desired, when memory ballooning is used for dynamic VM memory resizing, setting aside some memory during boot that can be added later on demand. Alternatives like virtio-mem that would avoid this issue are not yet available on s390x.
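At its core, the change below replaces the static has_pgste test in the zeropage gate with a per-mm flag that is cleared when storage keys or PV are enabled (and restored on failure); the gate, as it appears in the pgtable.h hunk further below, becomes:

	#define mm_forbids_zeropage mm_forbids_zeropage
	static inline int mm_forbids_zeropage(struct mm_struct *mm)
	{
	#ifdef CONFIG_PGSTE
		if (!mm->context.allow_cow_sharing)
			return 1;
	#endif
		return 0;
	}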
There could be ways to optimize some cases in user space: before reading memory in an anonymous private mapping on the migration source, check via /proc/self/pagemap if anything is already populated. Similarly check on the migration destination before reading. While that would avoid populating tables full of shared zeropages on all architectures, it's harder to get right and performant, and requires user space changes. Further, with postcopy live migration we must place a page, so there, "avoid touching memory to avoid allocating memory" is not really possible. (Note that previously we would have falsely inserted shared zeropages into processes using UFFDIO_ZEROPAGE where mm_forbids_zeropage() would have actually forbidden it.) PV is currently incompatible with memory ballooning, and in the common case, KVM guests don't make use of storage keys. Instead of zapping zeropages when enabling storage keys / PV, which turned out to be problematic in the past, let's do exactly the same as we do with KSM pages: trigger unsharing faults to replace the shared zeropages by proper anonymous folios. What about added latency when enabling storage keys? Having a lot of zeropages in applicable environments (PV, legacy guests, unittests) is unexpected. Further, KSM could today already unshare the zeropages and unmerging KSM pages when enabling storage keys would unshare the KSM-placed zeropages in the same way, resulting in the same latency. [ agordeev: Fixed sparse and checkpatch complaints and error handling ] Reviewed-by: Christian Borntraeger Tested-by: Christian Borntraeger Fixes: fa41ba0d08de ("s390/mm: avoid empty zero pages for KVM guests to avoid postcopy hangs") Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20240411161441.910170-3-david@redhat.com Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/gmap.h | 2 +- arch/s390/include/asm/mmu.h | 5 + arch/s390/include/asm/mmu_context.h | 1 + arch/s390/include/asm/pgtable.h | 16 ++- arch/s390/kvm/kvm-s390.c | 4 +- arch/s390/mm/gmap.c | 165 +++++++++++++++++++++------- 6 files changed, 146 insertions(+), 47 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 5cc46e0dde62..9725586f4259 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -146,7 +146,7 @@ int gmap_mprotect_notify(struct gmap *, unsigned long start, void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], unsigned long gaddr, unsigned long vmaddr); -int gmap_mark_unmergeable(void); +int s390_disable_cow_sharing(void); void s390_unlist_old_asce(struct gmap *gmap); int s390_replace_asce(struct gmap *gmap); void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns); diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index bb1b4bef1878..4c2dc7abc285 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -32,6 +32,11 @@ typedef struct { unsigned int uses_skeys:1; /* The mmu context uses CMM. */ unsigned int uses_cmm:1; + /* + * The mmu context allows COW-sharing of memory pages (KSM, zeropage). + * Note that COW-sharing during fork() is currently always allowed. + */ + unsigned int allow_cow_sharing:1; /* The gmaps associated with this context are allowed to use huge pages.
*/ unsigned int allow_gmap_hpage_1m:1; } mm_context_t; diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 929af18b0908..a7789a9f6218 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -35,6 +35,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.has_pgste = 0; mm->context.uses_skeys = 0; mm->context.uses_cmm = 0; + mm->context.allow_cow_sharing = 1; mm->context.allow_gmap_hpage_1m = 0; #endif switch (mm->context.asce_limit) { diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 60950e7a25f5..259c2439c251 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -566,10 +566,20 @@ static inline pud_t set_pud_bit(pud_t pud, pgprot_t prot) } /* - * In the case that a guest uses storage keys - * faults should no longer be backed by zero pages + * As soon as the guest uses storage keys or enables PV, we deduplicate all + * mapped shared zeropages and prevent new shared zeropages from getting + * mapped. */ -#define mm_forbids_zeropage mm_has_pgste +#define mm_forbids_zeropage mm_forbids_zeropage +static inline int mm_forbids_zeropage(struct mm_struct *mm) +{ +#ifdef CONFIG_PGSTE + if (!mm->context.allow_cow_sharing) + return 1; +#endif + return 0; +} + static inline int mm_uses_skeys(struct mm_struct *mm) { #ifdef CONFIG_PGSTE diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 5147b943a864..db3392f0be21 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2631,9 +2631,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) if (r) break; - mmap_write_lock(current->mm); - r = gmap_mark_unmergeable(); - mmap_write_unlock(current->mm); + r = s390_disable_cow_sharing(); if (r) break; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 094b43b121cd..f2988bbcebbe 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2549,41 +2549,6 @@ static inline void thp_split_mm(struct mm_struct *mm) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -/* - * Remove all empty zero pages from the mapping for lazy refaulting - * - This must be called after mm->context.has_pgste is set, to avoid - * future creation of zero pages - * - This must be called after THP was disabled. - * - * mm contracts with s390, that even if mm were to remove a page table, - * racing with the loop below and so causing pte_offset_map_lock() to fail, - * it will never insert a page table containing empty zero pages once - * mm_forbids_zeropage(mm) i.e. mm->context.has_pgste is set. 
- */ -static int __zap_zero_pages(pmd_t *pmd, unsigned long start, - unsigned long end, struct mm_walk *walk) -{ - unsigned long addr; - - for (addr = start; addr != end; addr += PAGE_SIZE) { - pte_t *ptep; - spinlock_t *ptl; - - ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); - if (!ptep) - break; - if (is_zero_pfn(pte_pfn(*ptep))) - ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID)); - pte_unmap_unlock(ptep, ptl); - } - return 0; -} - -static const struct mm_walk_ops zap_zero_walk_ops = { - .pmd_entry = __zap_zero_pages, - .walk_lock = PGWALK_WRLOCK, -}; - /* * switch on pgstes for its userspace process (for kvm) */ @@ -2601,22 +2566,142 @@ int s390_enable_sie(void) mm->context.has_pgste = 1; /* split thp mappings and disable thp for future mappings */ thp_split_mm(mm); - walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL); mmap_write_unlock(mm); return 0; } EXPORT_SYMBOL_GPL(s390_enable_sie); -int gmap_mark_unmergeable(void) +static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr, + unsigned long end, struct mm_walk *walk) { + unsigned long *found_addr = walk->private; + + /* Return 1 if the page is a zeropage. */ + if (is_zero_pfn(pte_pfn(*pte))) { + /* + * Shared zeropage in e.g., a FS DAX mapping? We cannot do the + * right thing and likely don't care: FAULT_FLAG_UNSHARE + * currently only works in COW mappings, which is also where + * mm_forbids_zeropage() is checked. + */ + if (!is_cow_mapping(walk->vma->vm_flags)) + return -EFAULT; + + *found_addr = addr; + return 1; + } + return 0; +} + +static const struct mm_walk_ops find_zeropage_ops = { + .pte_entry = find_zeropage_pte_entry, + .walk_lock = PGWALK_WRLOCK, +}; + +/* + * Unshare all shared zeropages, replacing them by anonymous pages. Note that + * we cannot simply zap all shared zeropages, because this could later + * trigger unexpected userfaultfd missing events. + * + * This must be called after mm->context.allow_cow_sharing was + * set to 0, to avoid future mappings of shared zeropages. + * + * mm contracts with s390, that even if mm were to remove a page table, + * and racing with walk_page_range_vma() calling pte_offset_map_lock() + * would fail, it will never insert a page table containing empty zero + * pages once mm_forbids_zeropage(mm) i.e. + * mm->context.allow_cow_sharing is set to 0. + */ +static int __s390_unshare_zeropages(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + unsigned long addr; + vm_fault_t fault; + int rc; + + for_each_vma(vmi, vma) { + /* + * We could only look at COW mappings, but it's more future + * proof to catch unexpected zeropages in other mappings and + * fail. + */ + if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma)) + continue; + addr = vma->vm_start; + +retry: + rc = walk_page_range_vma(vma, addr, vma->vm_end, + &find_zeropage_ops, &addr); + if (rc < 0) + return rc; + else if (!rc) + continue; + + /* addr was updated by find_zeropage_pte_entry() */ + fault = handle_mm_fault(vma, addr, + FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, + NULL); + if (fault & VM_FAULT_OOM) + return -ENOMEM; + /* + * See break_ksm(): even after handle_mm_fault() returned 0, we + * must start the lookup from the current address, because + * handle_mm_fault() may back out if there's any difficulty. + * + * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but + * maybe they could trigger in the future on concurrent + * truncation. In that case, the shared zeropage would be gone + * and we can simply retry and make progress.
+ */ + cond_resched(); + goto retry; + } + + return 0; +} + +static int __s390_disable_cow_sharing(struct mm_struct *mm) +{ + int rc; + + if (!mm->context.allow_cow_sharing) + return 0; + + mm->context.allow_cow_sharing = 0; + + /* Replace all shared zeropages with anonymous pages. */ + rc = __s390_unshare_zeropages(mm); /* * Make sure to disable KSM (if enabled for the whole process or * individual VMAs). Note that nothing currently hinders user space * from re-enabling it. */ - return ksm_disable(current->mm); + if (!rc) + rc = ksm_disable(mm); + if (rc) + mm->context.allow_cow_sharing = 1; + return rc; } -EXPORT_SYMBOL_GPL(gmap_mark_unmergeable); + +/* + * Disable most COW-sharing of memory pages for the whole process: + * (1) Disable KSM and unmerge/unshare any KSM pages. + * (2) Disallow shared zeropages and unshare any zeropages that are mapped. + * + * Note that we currently don't bother with COW-shared pages that are shared + * with parent/child processes due to fork(). + */ +int s390_disable_cow_sharing(void) +{ + int rc; + + mmap_write_lock(current->mm); + rc = __s390_disable_cow_sharing(current->mm); + mmap_write_unlock(current->mm); + return rc; +} +EXPORT_SYMBOL_GPL(s390_disable_cow_sharing); /* * Enable storage key handling from now on and initialize the storage * keys with the current storage key state. @@ -2685,7 +2770,7 @@ int s390_enable_skey(void) goto out_up; mm->context.uses_skeys = 1; - rc = gmap_mark_unmergeable(); + rc = __s390_disable_cow_sharing(mm); if (rc) { mm->context.uses_skeys = 0; goto out_up; From 6e6973948ce6911e35cb559bc53de4156b4a9675 Mon Sep 17 00:00:00 2001 From: "Jason J. Herne" Date: Mon, 15 Apr 2024 11:25:51 -0400 Subject: [PATCH 41/54] s390/ap: Externalize AP bus specific bitmap reading function Rename hex2bitmap() to ap_hex2bitmap() and export it for external use. This function will be used by the implementation of the vfio-ap ap_config sysfs attribute. Signed-off-by: "Jason J. Herne" Reviewed-by: Tony Krowiak Reviewed-by: Harald Freudenberger Link: https://lore.kernel.org/r/20240415152555.13152-2-jjherne@linux.ibm.com Signed-off-by: Alexander Gordeev --- drivers/s390/crypto/ap_bus.c | 13 +++---------- drivers/s390/crypto/ap_bus.h | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index 09059b3a3a42..c20b45092079 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -1043,15 +1043,7 @@ static struct notifier_block ap_bus_nb = { .notifier_call = ap_bus_cfg_chg, }; -/* - * hex2bitmap() - parse hex mask string and set bitmap. - * Valid strings are "0x012345678" with at least one valid hex number. - * Rest of the bitmap to the right is padded with 0. No spaces allowed - * within the string, the leading 0x may be omitted. - * Returns the bitmask with exactly the bits set as given by the hex - * string (both in big endian order).
- */ -static int hex2bitmap(const char *str, unsigned long *bitmap, int bits) +int ap_hex2bitmap(const char *str, unsigned long *bitmap, int bits) { int i, n, b; @@ -1078,6 +1070,7 @@ static int hex2bitmap(const char *str, unsigned long *bitmap, int bits) return -EINVAL; return 0; } +EXPORT_SYMBOL(ap_hex2bitmap); /* * modify_bitmap() - parse bitmask argument and modify an existing @@ -1143,7 +1136,7 @@ static int ap_parse_bitmap_str(const char *str, unsigned long *bitmap, int bits, rc = modify_bitmap(str, newmap, bits); } else { memset(newmap, 0, size); - rc = hex2bitmap(str, newmap, bits); + rc = ap_hex2bitmap(str, newmap, bits); } return rc; } diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h index 59c7ed49aa02..fdbc6fdfdf57 100644 --- a/drivers/s390/crypto/ap_bus.h +++ b/drivers/s390/crypto/ap_bus.h @@ -343,6 +343,28 @@ int ap_parse_mask_str(const char *str, unsigned long *bitmap, int bits, struct mutex *lock); +/* + * ap_hex2bitmap() - Convert a string containing a hexadecimal number (str) + * into a bitmap (bitmap) with bits set that correspond to the bits represented + * by the hex string. Input and output data is in big endian order. + * + * str - Input hex string of format "0x1234abcd". The leading "0x" is optional. + * At least one digit is required. Must be large enough to hold the number of + * bits represented by the bits parameter. + * + * bitmap - Pointer to a bitmap. Upon successful completion of this function, + * this bitmap will have bits set to match the value of str. If bitmap is longer + * than str, then the rightmost bits of bitmap are padded with zeros. Must be + * large enough to hold the number of bits represented by the bits parameter. + * + * bits - Length, in bits, of the bitmap represented by str. Must be a multiple + * of 8. + * + * Returns: 0 On success + * -EINVAL If str format is invalid or bits is not a multiple of 8. + */ +int ap_hex2bitmap(const char *str, unsigned long *bitmap, int bits); + /* * Interface to wait for the AP bus to have done one initial ap bus * scan and all detected APQNs have been bound to device drivers. From e12aa0b5b2fbf324daf62245e76083e6065ec47c Mon Sep 17 00:00:00 2001 From: "Jason J. Herne" Date: Mon, 15 Apr 2024 11:25:52 -0400 Subject: [PATCH 42/54] s390/vfio-ap: Add sysfs attr, ap_config, to export mdev state Add ap_config sysfs attribute. This will provide the means for setting or displaying the adapters, domains and control domains assigned to the vfio-ap mediated device in a single operation. This sysfs attribute is comprised of three masks: One for adapters, one for domains, and one for control domains. This attribute is intended to be used by mdevctl to query a vfio-ap mediated device's state. Signed-off-by: "Jason J. 
Herne" Reviewed-by: Tony Krowiak Tested-by: Matthew Rosato Link: https://lore.kernel.org/r/20240415152555.13152-3-jjherne@linux.ibm.com Signed-off-by: Alexander Gordeev --- drivers/s390/crypto/vfio_ap_ops.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index fc169bc61593..e01f53a3c5b7 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -1570,6 +1570,32 @@ static ssize_t guest_matrix_show(struct device *dev, } static DEVICE_ATTR_RO(guest_matrix); +static ssize_t write_ap_bitmap(unsigned long *bitmap, char *buf, int offset, char sep) +{ + return sysfs_emit_at(buf, offset, "0x%016lx%016lx%016lx%016lx%c", + bitmap[0], bitmap[1], bitmap[2], bitmap[3], sep); +} + +static ssize_t ap_config_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); + int idx = 0; + + idx += write_ap_bitmap(matrix_mdev->matrix.apm, buf, idx, ','); + idx += write_ap_bitmap(matrix_mdev->matrix.aqm, buf, idx, ','); + idx += write_ap_bitmap(matrix_mdev->matrix.adm, buf, idx, '\n'); + + return idx; +} + +static ssize_t ap_config_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + return count; +} +static DEVICE_ATTR_RW(ap_config); + static struct attribute *vfio_ap_mdev_attrs[] = { &dev_attr_assign_adapter.attr, &dev_attr_unassign_adapter.attr, @@ -1577,6 +1603,7 @@ static struct attribute *vfio_ap_mdev_attrs[] = { &dev_attr_unassign_domain.attr, &dev_attr_assign_control_domain.attr, &dev_attr_unassign_control_domain.attr, + &dev_attr_ap_config.attr, &dev_attr_control_domains.attr, &dev_attr_matrix.attr, &dev_attr_guest_matrix.attr, From f3e3a4008c1d06707fd57664e3071f43b417762d Mon Sep 17 00:00:00 2001 From: "Jason J. Herne" Date: Mon, 15 Apr 2024 11:25:53 -0400 Subject: [PATCH 43/54] s390/vfio-ap: Ignore duplicate link requests in vfio_ap_mdev_link_queue vfio_ap_mdev_link_queue is changed to detect if a matrix_mdev has already linked the given queue. If so, it bails out. Signed-off-by: "Jason J. Herne" Reviewed-by: Tony Krowiak Link: https://lore.kernel.org/r/20240415152555.13152-4-jjherne@linux.ibm.com Signed-off-by: Alexander Gordeev --- drivers/s390/crypto/vfio_ap_ops.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index e01f53a3c5b7..1499c2181122 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -794,10 +794,11 @@ static int vfio_ap_mdev_probe(struct mdev_device *mdev) static void vfio_ap_mdev_link_queue(struct ap_matrix_mdev *matrix_mdev, struct vfio_ap_queue *q) { - if (q) { - q->matrix_mdev = matrix_mdev; - hash_add(matrix_mdev->qtable.queues, &q->mdev_qnode, q->apqn); - } + if (!q || vfio_ap_mdev_get_queue(matrix_mdev, q->apqn)) + return; + + q->matrix_mdev = matrix_mdev; + hash_add(matrix_mdev->qtable.queues, &q->mdev_qnode, q->apqn); } static void vfio_ap_mdev_link_apqn(struct ap_matrix_mdev *matrix_mdev, int apqn) From 8fb456bc9f9b136d5369f8fdd54597edd692d36d Mon Sep 17 00:00:00 2001 From: "Jason J. Herne" Date: Mon, 15 Apr 2024 11:25:54 -0400 Subject: [PATCH 44/54] s390/vfio-ap: Add write support to sysfs attr ap_config Allow writing a complete set of masks to ap_config. Doing so will cause the vfio-ap driver to replace the vfio-ap mediated device's matrix masks with the given set of masks. 
If the given state cannot be set, then no changes are made to the vfio-ap mediated device. The format of the data written to ap_config is as follows: {amask},{dmask},{cmask}\n \n is a newline character. amask, dmask, and cmask are masks identifying which adapters, domains, and control domains should be assigned to the mediated device. The format of a mask is as follows: 0xNN..NN Where NN..NN is 64 hexadecimal characters representing a 256-bit value. The leftmost (highest order) bit represents adapter/domain 0. For an example set of masks that represent your mdev's current configuration, simply cat ap_config. This attribute is intended to be used by an mdevctl callout script supporting the mdev type vfio_ap-passthrough to atomically update a vfio-ap mediated device's state. Signed-off-by: "Jason J. Herne" Reviewed-by: Tony Krowiak Tested-by: Matthew Rosato Link: https://lore.kernel.org/r/20240415152555.13152-5-jjherne@linux.ibm.com Signed-off-by: Alexander Gordeev --- drivers/s390/crypto/vfio_ap_ops.c | 190 ++++++++++++++++++++++++-- drivers/s390/crypto/vfio_ap_private.h | 6 +- 2 files changed, 180 insertions(+), 16 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 1499c2181122..9f76f2d7b66e 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -1119,20 +1119,29 @@ static void vfio_ap_mdev_unlink_adapter(struct ap_matrix_mdev *matrix_mdev, } } -static void vfio_ap_mdev_hot_unplug_adapter(struct ap_matrix_mdev *matrix_mdev, - unsigned long apid) +static void vfio_ap_mdev_hot_unplug_adapters(struct ap_matrix_mdev *matrix_mdev, + unsigned long *apids) { struct vfio_ap_queue *q, *tmpq; struct list_head qlist; + unsigned long apid; + bool apcb_update = false; INIT_LIST_HEAD(&qlist); - vfio_ap_mdev_unlink_adapter(matrix_mdev, apid, &qlist); - if (test_bit_inv(apid, matrix_mdev->shadow_apcb.apm)) { - clear_bit_inv(apid, matrix_mdev->shadow_apcb.apm); - vfio_ap_mdev_update_guest_apcb(matrix_mdev); + for_each_set_bit_inv(apid, apids, AP_DEVICES) { + vfio_ap_mdev_unlink_adapter(matrix_mdev, apid, &qlist); + + if (test_bit_inv(apid, matrix_mdev->shadow_apcb.apm)) { + clear_bit_inv(apid, matrix_mdev->shadow_apcb.apm); + apcb_update = true; + } } + /* Only update apcb if needed to avoid impacting guest */ + if (apcb_update) + vfio_ap_mdev_update_guest_apcb(matrix_mdev); + vfio_ap_mdev_reset_qlist(&qlist); list_for_each_entry_safe(q, tmpq, &qlist, reset_qnode) { @@ -1141,6 +1150,16 @@ static void vfio_ap_mdev_hot_unplug_adapter(struct ap_matrix_mdev *matrix_mdev, } } +static void vfio_ap_mdev_hot_unplug_adapter(struct ap_matrix_mdev *matrix_mdev, + unsigned long apid) +{ + DECLARE_BITMAP(apids, AP_DEVICES); + + bitmap_zero(apids, AP_DEVICES); + set_bit_inv(apid, apids); + vfio_ap_mdev_hot_unplug_adapters(matrix_mdev, apids); +} + /** * unassign_adapter_store - parses the APID from @buf and clears the * corresponding bit in the mediated matrix device's APM @@ -1301,20 +1320,29 @@ static void vfio_ap_mdev_unlink_domain(struct ap_matrix_mdev *matrix_mdev, } } -static void vfio_ap_mdev_hot_unplug_domain(struct ap_matrix_mdev *matrix_mdev, - unsigned long apqi) +static void vfio_ap_mdev_hot_unplug_domains(struct ap_matrix_mdev *matrix_mdev, + unsigned long *apqis) { struct vfio_ap_queue *q, *tmpq; struct list_head qlist; + unsigned long apqi; + bool apcb_update = false; INIT_LIST_HEAD(&qlist); - vfio_ap_mdev_unlink_domain(matrix_mdev, apqi, &qlist); - if (test_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm)) { - 
clear_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm); - vfio_ap_mdev_update_guest_apcb(matrix_mdev); + for_each_set_bit_inv(apqi, apqis, AP_DOMAINS) { + vfio_ap_mdev_unlink_domain(matrix_mdev, apqi, &qlist); + + if (test_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm)) { + clear_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm); + apcb_update = true; + } } + /* Only update apcb if needed to avoid impacting guest */ + if (apcb_update) + vfio_ap_mdev_update_guest_apcb(matrix_mdev); + vfio_ap_mdev_reset_qlist(&qlist); list_for_each_entry_safe(q, tmpq, &qlist, reset_qnode) { @@ -1323,6 +1351,16 @@ static void vfio_ap_mdev_hot_unplug_domain(struct ap_matrix_mdev *matrix_mdev, } } +static void vfio_ap_mdev_hot_unplug_domain(struct ap_matrix_mdev *matrix_mdev, + unsigned long apqi) +{ + DECLARE_BITMAP(apqis, AP_DOMAINS); + + bitmap_zero(apqis, AP_DEVICES); + set_bit_inv(apqi, apqis); + vfio_ap_mdev_hot_unplug_domains(matrix_mdev, apqis); +} + /** * unassign_domain_store - parses the APQI from @buf and clears the * corresponding bit in the mediated matrix device's AQM @@ -1590,10 +1628,136 @@ static ssize_t ap_config_show(struct device *dev, struct device_attribute *attr, return idx; } +/* Number of characters needed for a complete hex mask representing the bits in .. */ +#define AP_DEVICES_STRLEN (AP_DEVICES / 4 + 3) +#define AP_DOMAINS_STRLEN (AP_DOMAINS / 4 + 3) +#define AP_CONFIG_STRLEN (AP_DEVICES_STRLEN + 2 * AP_DOMAINS_STRLEN) + +static int parse_bitmap(char **strbufptr, unsigned long *bitmap, int nbits) +{ + char *curmask; + + curmask = strsep(strbufptr, ",\n"); + if (!curmask) + return -EINVAL; + + bitmap_clear(bitmap, 0, nbits); + return ap_hex2bitmap(curmask, bitmap, nbits); +} + +static int ap_matrix_overflow_check(struct ap_matrix_mdev *matrix_mdev) +{ + unsigned long bit; + + for_each_set_bit_inv(bit, matrix_mdev->matrix.apm, AP_DEVICES) { + if (bit > matrix_mdev->matrix.apm_max) + return -ENODEV; + } + + for_each_set_bit_inv(bit, matrix_mdev->matrix.aqm, AP_DOMAINS) { + if (bit > matrix_mdev->matrix.aqm_max) + return -ENODEV; + } + + for_each_set_bit_inv(bit, matrix_mdev->matrix.adm, AP_DOMAINS) { + if (bit > matrix_mdev->matrix.adm_max) + return -ENODEV; + } + + return 0; +} + +static void ap_matrix_copy(struct ap_matrix *dst, struct ap_matrix *src) +{ + /* This check works around false positive gcc -Wstringop-overread */ + if (!src) + return; + + bitmap_copy(dst->apm, src->apm, AP_DEVICES); + bitmap_copy(dst->aqm, src->aqm, AP_DOMAINS); + bitmap_copy(dst->adm, src->adm, AP_DOMAINS); +} + static ssize_t ap_config_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - return count; + struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev); + struct ap_matrix m_new, m_old, m_added, m_removed; + DECLARE_BITMAP(apm_filtered, AP_DEVICES); + unsigned long newbit; + char *newbuf, *rest; + int rc = count; + bool do_update; + + newbuf = kstrndup(buf, AP_CONFIG_STRLEN, GFP_KERNEL); + if (!newbuf) + return -ENOMEM; + rest = newbuf; + + mutex_lock(&ap_perms_mutex); + get_update_locks_for_mdev(matrix_mdev); + + /* Save old state */ + ap_matrix_copy(&m_old, &matrix_mdev->matrix); + if (parse_bitmap(&rest, m_new.apm, AP_DEVICES) || + parse_bitmap(&rest, m_new.aqm, AP_DOMAINS) || + parse_bitmap(&rest, m_new.adm, AP_DOMAINS)) { + rc = -EINVAL; + goto out; + } + + bitmap_andnot(m_removed.apm, m_old.apm, m_new.apm, AP_DEVICES); + bitmap_andnot(m_removed.aqm, m_old.aqm, m_new.aqm, AP_DOMAINS); + bitmap_andnot(m_added.apm, m_new.apm, m_old.apm, AP_DEVICES); + 
bitmap_andnot(m_added.aqm, m_new.aqm, m_old.aqm, AP_DOMAINS); + + /* Need new bitmaps in matrix_mdev for validation */ + ap_matrix_copy(&matrix_mdev->matrix, &m_new); + + /* Ensure new state is valid, else undo new state */ + rc = vfio_ap_mdev_validate_masks(matrix_mdev); + if (rc) { + ap_matrix_copy(&matrix_mdev->matrix, &m_old); + goto out; + } + rc = ap_matrix_overflow_check(matrix_mdev); + if (rc) { + ap_matrix_copy(&matrix_mdev->matrix, &m_old); + goto out; + } + rc = count; + + /* Need old bitmaps in matrix_mdev for unplug/unlink */ + ap_matrix_copy(&matrix_mdev->matrix, &m_old); + + /* Unlink removed adapters/domains */ + vfio_ap_mdev_hot_unplug_adapters(matrix_mdev, m_removed.apm); + vfio_ap_mdev_hot_unplug_domains(matrix_mdev, m_removed.aqm); + + /* Need new bitmaps in matrix_mdev for linking new adapters/domains */ + ap_matrix_copy(&matrix_mdev->matrix, &m_new); + + /* Link newly added adapters */ + for_each_set_bit_inv(newbit, m_added.apm, AP_DEVICES) + vfio_ap_mdev_link_adapter(matrix_mdev, newbit); + + for_each_set_bit_inv(newbit, m_added.aqm, AP_DOMAINS) + vfio_ap_mdev_link_domain(matrix_mdev, newbit); + + /* filter resources not bound to vfio-ap */ + do_update = vfio_ap_mdev_filter_matrix(matrix_mdev, apm_filtered); + do_update |= vfio_ap_mdev_filter_cdoms(matrix_mdev); + + /* Apply changes to shadow apcb if things changed */ + if (do_update) { + vfio_ap_mdev_update_guest_apcb(matrix_mdev); + reset_queues_for_apids(matrix_mdev, apm_filtered); + } +out: + release_update_locks_for_mdev(matrix_mdev); + mutex_unlock(&ap_perms_mutex); + kfree(newbuf); + return rc; } static DEVICE_ATTR_RW(ap_config); diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h index 98d37aa27044..437a161c8659 100644 --- a/drivers/s390/crypto/vfio_ap_private.h +++ b/drivers/s390/crypto/vfio_ap_private.h @@ -75,11 +75,11 @@ extern struct ap_matrix_dev *matrix_dev; */ struct ap_matrix { unsigned long apm_max; - DECLARE_BITMAP(apm, 256); + DECLARE_BITMAP(apm, AP_DEVICES); unsigned long aqm_max; - DECLARE_BITMAP(aqm, 256); + DECLARE_BITMAP(aqm, AP_DOMAINS); unsigned long adm_max; - DECLARE_BITMAP(adm, 256); + DECLARE_BITMAP(adm, AP_DOMAINS); }; /** From 3113a29e25fe072d6eb0abccd695d7cc72ac9b6f Mon Sep 17 00:00:00 2001 From: "Jason J. Herne" Date: Mon, 15 Apr 2024 11:25:55 -0400 Subject: [PATCH 45/54] docs: Update s390 vfio-ap doc for ap_config sysfs attribute A new sysfs attribute, ap_config, for the vfio_ap driver is documented. Signed-off-by: "Jason J. Herne" Reviewed-by: Tony Krowiak Link: https://lore.kernel.org/r/20240415152555.13152-6-jjherne@linux.ibm.com Signed-off-by: Alexander Gordeev --- Documentation/arch/s390/vfio-ap.rst | 30 +++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/Documentation/arch/s390/vfio-ap.rst b/Documentation/arch/s390/vfio-ap.rst index cf8533f2a8b6..ea744cbc8687 100644 --- a/Documentation/arch/s390/vfio-ap.rst +++ b/Documentation/arch/s390/vfio-ap.rst @@ -380,6 +380,36 @@ matrix device. control_domains: A read-only file for displaying the control domain numbers assigned to the vfio_ap mediated device. + ap_config: + A read/write file that, when written to, allows all three of the + vfio_ap mediated device's ap matrix masks to be replaced in one shot. + Three masks are given, one for adapters, one for domains, and one for + control domains. If the given state cannot be set then no changes are + made to the vfio-ap mediated device.
+ + The format of the data written to ap_config is as follows: + {amask},{dmask},{cmask}\n + + \n is a newline character. + + amask, dmask, and cmask are masks identifying which adapters, domains, + and control domains should be assigned to the mediated device. + + The format of a mask is as follows: + 0xNN..NN + + Where NN..NN is 64 hexadecimal characters representing a 256-bit value. + The leftmost (highest order) bit represents adapter/domain 0. + + For an example set of masks that represent your mdev's current + configuration, simply cat ap_config. + + Setting an adapter or domain number greater than the maximum allowed for + the system will result in an error. + + This attribute is intended to be used by automation. End users would be + better served using the respective assign/unassign attributes for + adapters, domains, and control domains. * functions: From a2269a66eec37916e2bcc148b7f7ed398b66263f Mon Sep 17 00:00:00 2001 From: Alexander Egorenkov Date: Tue, 23 Apr 2024 11:42:05 +0200 Subject: [PATCH 46/54] s390/os_info: Initialize old os_info in standalone dump kernel The commit be42660d0c13 ("s390/crash: use old os_info to create PT_LOAD headers") introduced use of the old os_info into the standalone dump kernel. Before this change os_info_old_init() was expected to be called only from a regular kdump kernel although the function itself is able to work in standalone dump kernels as well (because copy_oldmem_kernel() is able to handle both use cases). Therefore, fix the expectation of os_info_old_init() and enable it to be called from a standalone dump kernel. Fixes: f4cac27dc0d6 ("s390/crash: Use old os_info to create PT_LOAD headers") Acked-by: Alexander Gordeev Signed-off-by: Alexander Egorenkov Signed-off-by: Alexander Gordeev --- arch/s390/kernel/os_info.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c index 3f4dc045a894..cb73860c048d 100644 --- a/arch/s390/kernel/os_info.c +++ b/arch/s390/kernel/os_info.c @@ -18,6 +18,7 @@ #include #include #include +#include <asm/ipl.h> /* * OS info structure has to be page aligned */ @@ -146,7 +147,7 @@ static void os_info_old_init(void) if (os_info_init) return; - if (!oldmem_data.start) + if (!oldmem_data.start && !is_ipl_type_dump()) goto fail; if (copy_oldmem_kernel(&addr, __LC_OS_INFO, sizeof(addr))) goto fail; From fe742c08f3d930d62647412f602d2b4a211a0a39 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 26 Apr 2024 08:02:06 +0200 Subject: [PATCH 47/54] s390/os_info: Fix array size in struct os_info gcc's -Warray-bounds warned about an out-of-bounds access to the entry array contained in struct os_info. This doesn't trigger a bug right now because there's a large reserved space after the array. Nevertheless fix this, and also add a BUILD_BUG_ON to make sure struct os_info is always exactly one page in size.
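As a quick consistency check of the new sizes (an editorial note, assuming struct os_info_entry remains a packed 20-byte record of u64 addr, u64 size and u32 csum): growing entry[] from 10 to OS_INFO_MAX (13) elements adds 3 * 20 = 60 bytes, which is exactly why the reserved pad in the hunk below shrinks from 3864 to 3804 bytes, keeping sizeof(struct os_info) at one page as the new BUILD_BUG_ON asserts.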
Fixes: f4cac27dc0d6 ("s390/crash: Use old os_info to create PT_LOAD headers") Reviewed-by: Alexander Gordeev Signed-off-by: Sven Schnelle Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/os_info.h | 5 +++-- arch/s390/kernel/os_info.c | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/os_info.h b/arch/s390/include/asm/os_info.h index dea2b37b635e..3ee9e8f5ceae 100644 --- a/arch/s390/include/asm/os_info.h +++ b/arch/s390/include/asm/os_info.h @@ -27,6 +27,7 @@ #define OS_INFO_IMAGE_START 10 #define OS_INFO_IMAGE_END 11 #define OS_INFO_IMAGE_PHYS 12 +#define OS_INFO_MAX 13 #define OS_INFO_FLAG_REIPL_CLEAR (1UL << 0) @@ -46,8 +47,8 @@ struct os_info { u16 version_minor; u64 crashkernel_addr; u64 crashkernel_size; - struct os_info_entry entry[10]; - u8 reserved[3864]; + struct os_info_entry entry[OS_INFO_MAX]; + u8 reserved[3804]; } __packed; void os_info_init(void); diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c index cb73860c048d..b695f980bbde 100644 --- a/arch/s390/kernel/os_info.c +++ b/arch/s390/kernel/os_info.c @@ -73,6 +73,7 @@ void __init os_info_init(void) { struct lowcore *abs_lc; + BUILD_BUG_ON(sizeof(struct os_info) != PAGE_SIZE); os_info.version_major = OS_INFO_VERSION_MAJOR; os_info.version_minor = OS_INFO_VERSION_MINOR; os_info.magic = OS_INFO_MAGIC; From 9679fec2cad447d70f32142c194f2d1c8544717c Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 23 Apr 2024 16:27:24 +0200 Subject: [PATCH 48/54] s390/pci: Drop unneeded reference to CONFIG_DMI The S/390 architecture doesn't support SMBIOS, so CONFIG_DMI will never be defined there. So we can simply omit these preprocessing directives and speed up the build a bit. Signed-off-by: Jean Delvare Cc: Niklas Schnelle Cc: Gerald Schaefer Acked-by: Niklas Schnelle Link: https://lore.kernel.org/r/20240423162724.3966265a@endymion.delvare Signed-off-by: Alexander Gordeev --- arch/s390/pci/pci_sysfs.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c index a0b872b74fe3..0f4f1e8fc480 100644 --- a/arch/s390/pci/pci_sysfs.c +++ b/arch/s390/pci/pci_sysfs.c @@ -172,7 +172,6 @@ static ssize_t uid_is_unique_show(struct device *dev, } static DEVICE_ATTR_RO(uid_is_unique); -#ifndef CONFIG_DMI /* analogous to smbios index */ static ssize_t index_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -202,7 +201,6 @@ static struct attribute_group zpci_ident_attr_group = { .attrs = zpci_ident_attrs, .is_visible = zpci_index_is_visible, }; -#endif static struct bin_attribute *zpci_bin_attrs[] = { &bin_attr_util_string, @@ -245,8 +243,6 @@ static struct attribute_group pfip_attr_group = { const struct attribute_group *zpci_attr_groups[] = { &zpci_attr_group, &pfip_attr_group, -#ifndef CONFIG_DMI &zpci_ident_attr_group, -#endif NULL, }; From cae74ba8c295bc41bda749ef27a8f2b3ee957a41 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 26 Apr 2024 12:02:15 +0200 Subject: [PATCH 49/54] s390/ftrace: Use unwinder instead of __builtin_return_address() Using __builtin_return_address(n) might return undefined values when used with values of n outside of the stack. This was noticed when __builtin_return_address() was called in ftrace on top level functions like the interrupt handlers. As this behaviour cannot be fixed, use the s390 stack unwinder and remove the ftrace compilation flags for unwind_bc.c and stacktrace.c to prevent the unwinding function polluting function traces. 
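For illustration (an editorial sketch, not part of this patch): helpers such as the CALLER_ADDR1/CALLER_ADDR2 macros in linux/ftrace.h expand to ftrace_return_address(n), so after this change a request for a stack frame that does not exist yields a well-defined 0 from the unwinder rather than an undefined value:

	/* sketch: behaviour of the new unwinder-backed lookup */
	unsigned long caller = ftrace_return_address(1);	/* caller of the current function */
	unsigned long outer = ftrace_return_address(16);	/* 0 when the stack is shallower */
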
Another advantage is that this also works with clang. Reviewed-by: Heiko Carstens Signed-off-by: Sven Schnelle Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/ftrace.h | 8 ++------ arch/s390/kernel/Makefile | 2 ++ arch/s390/kernel/stacktrace.c | 19 +++++++++++++++++++ 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 621f23d5ae30..77e479d44f1e 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -8,12 +8,8 @@ #ifndef __ASSEMBLY__ -#ifdef CONFIG_CC_IS_CLANG -/* https://llvm.org/pr41424 */ -#define ftrace_return_address(n) 0UL -#else -#define ftrace_return_address(n) __builtin_return_address(n) -#endif +unsigned long return_address(unsigned int n); +#define ftrace_return_address(n) return_address(n) void ftrace_caller(void); diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index fa029d0dc28f..db2d9ba5a86d 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -11,6 +11,8 @@ CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) # Do not trace early setup code CFLAGS_REMOVE_early.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_rethook.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_stacktrace.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_unwind_bc.o = $(CC_FLAGS_FTRACE) endif diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 94f440e38303..7c294da45bf5 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -101,3 +101,22 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, } pagefault_enable(); } + +unsigned long return_address(unsigned int n) +{ + struct unwind_state state; + unsigned long addr; + + /* Increment to skip current stack entry */ + n++; + + unwind_for_each_frame(&state, NULL, NULL, 0) { + addr = unwind_get_return_address(&state); + if (!addr) + break; + if (!n--) + return addr; + } + return 0; +} +EXPORT_SYMBOL_GPL(return_address); From 5f90003f09042b504d90ee38618cfd380ce16f4a Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Thu, 25 Apr 2024 16:59:30 +0200 Subject: [PATCH 50/54] s390: vmlinux.lds.S: Drop .hash and .gnu.hash for !CONFIG_PIE_BUILD Sections .hash and .gnu.hash are only created when the CONFIG_PIE_BUILD option is enabled. Drop them when CONFIG_PIE_BUILD is disabled. [ agordeev: Reworded the commit message ] Fixes: 778666df60f0 ("s390: compile relocatable kernel without -fPIE") Suggested-by: Alexander Gordeev Signed-off-by: Sumanth Korikkar Reviewed-by: Alexander Gordeev Signed-off-by: Alexander Gordeev --- arch/s390/kernel/vmlinux.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 023fade1da3b..163d64f300c1 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -209,13 +209,13 @@ SECTIONS .dynstr ALIGN(8) : { *(.dynstr) } -#endif .hash ALIGN(8) : { *(.hash) } .gnu.hash ALIGN(8) : { *(.gnu.hash) } +#endif . = ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ From 00cda11d3b2ea07295490b7d67942014f1cbc5c1 Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Thu, 25 Apr 2024 16:59:31 +0200 Subject: [PATCH 51/54] s390: Compile kernel with -fPIC and link with -no-pie When the kernel is built with the CONFIG_PIE_BUILD option enabled it uses dynamic symbols, for which the linker does not allow more than 64K entries. This can break features like kpatch.
Hence, whenever possible the kernel is built with the CONFIG_PIE_BUILD option disabled. That requires the compiler to support the unaligned symbols generated by linker scripts; older compilers might lack such support, in which case the build process falls back to a CONFIG_PIE_BUILD-enabled build. Compile object files with the -fPIC option and link the kernel binary with the -no-pie linker option. As a result, no dynamic symbols are generated; not only does the kpatch feature work, but the whole CONFIG_PIE_BUILD-specific code can be dropped. [ agordeev: Reworded the commit message ] Suggested-by: Ulrich Weigand Signed-off-by: Sumanth Korikkar Reviewed-by: Alexander Gordeev Signed-off-by: Alexander Gordeev --- arch/s390/Kconfig | 12 ---------- arch/s390/Makefile | 9 ++------ arch/s390/boot/Makefile | 7 +----- arch/s390/boot/boot.h | 6 ----- arch/s390/boot/startup.c | 42 ---------------------------------- arch/s390/boot/vmlinux.lds.S | 2 -- arch/s390/kernel/vmlinux.lds.S | 33 -------------------------- 7 files changed, 3 insertions(+), 108 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a077ded1b9e6..7e7fe89c9b25 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -593,18 +593,6 @@ config RELOCATABLE Note: this option exists only for documentation purposes, please do not remove it. -config PIE_BUILD - def_bool CC_IS_CLANG && !$(cc-option,-munaligned-symbols) - help - If the compiler is unable to generate code that can manage unaligned - symbols, the kernel is linked as a position-independent executable - (PIE) and includes dynamic relocations that are processed early - during bootup. - - For kpatch functionality, it is recommended to build the kernel - without the PIE_BUILD option. PIE_BUILD is only enabled when the - compiler lacks proper support for handling unaligned symbols.
- config RANDOMIZE_BASE bool "Randomize the address of the kernel image (KASLR)" default y diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 64821f54f1e0..f2b21c7a70ef 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -14,14 +14,9 @@ KBUILD_AFLAGS_MODULE += -fPIC KBUILD_CFLAGS_MODULE += -fPIC KBUILD_AFLAGS += -m64 KBUILD_CFLAGS += -m64 -ifdef CONFIG_PIE_BUILD -KBUILD_CFLAGS += -fPIE -LDFLAGS_vmlinux := -pie -z notext -else -KBUILD_CFLAGS += $(call cc-option,-munaligned-symbols,) -LDFLAGS_vmlinux := --emit-relocs --discard-none +KBUILD_CFLAGS += -fPIC +LDFLAGS_vmlinux := -no-pie --emit-relocs --discard-none extra_tools := relocs -endif aflags_dwarf := -Wa,-gdwarf-2 KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__ ifndef CONFIG_AS_IS_LLVM diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index bd5d4d37a961..070c9b2e905f 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -37,8 +37,7 @@ CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char obj-y := head.o als.o startup.o physmem_info.o ipl_parm.o ipl_report.o vmem.o obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o -obj-y += version.o pgm_check_info.o ctype.o ipl_data.o -obj-y += $(if $(CONFIG_PIE_BUILD),machine_kexec_reloc.o,relocs.o) +obj-y += version.o pgm_check_info.o ctype.o ipl_data.o relocs.o obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o obj-y += $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o @@ -49,9 +48,7 @@ targets := bzImage section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y targets += vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4 targets += vmlinux.bin.zst info.bin syms.bin vmlinux.syms $(obj-all) -ifndef CONFIG_PIE_BUILD targets += relocs.S -endif OBJECTS := $(addprefix $(obj)/,$(obj-y)) OBJECTS_ALL := $(addprefix $(obj)/,$(obj-all)) @@ -110,13 +107,11 @@ OBJCOPYFLAGS_vmlinux.bin := -O binary --remove-section=.comment --remove-section $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) -ifndef CONFIG_PIE_BUILD CMD_RELOCS=arch/s390/tools/relocs quiet_cmd_relocs = RELOCS $@ cmd_relocs = $(CMD_RELOCS) $< > $@ $(obj)/relocs.S: vmlinux FORCE $(call if_changed,relocs) -endif suffix-$(CONFIG_KERNEL_GZIP) := .gz suffix-$(CONFIG_KERNEL_BZIP2) := .bz2 diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index 85da1c6cef4f..18027fdc92b0 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -24,14 +24,8 @@ struct vmlinux_info { unsigned long bootdata_size; unsigned long bootdata_preserved_off; unsigned long bootdata_preserved_size; -#ifdef CONFIG_PIE_BUILD - unsigned long dynsym_start; - unsigned long rela_dyn_start; - unsigned long rela_dyn_end; -#else unsigned long got_start; unsigned long got_end; -#endif unsigned long amode31_size; unsigned long init_mm_off; unsigned long swapper_pg_dir_off; diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 246d54499c20..467283b112cd 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -151,41 +151,6 @@ static void copy_bootdata(void) memcpy((void *)vmlinux.bootdata_preserved_off, __boot_data_preserved_start, vmlinux.bootdata_preserved_size); } -#ifdef CONFIG_PIE_BUILD -static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, - unsigned long offset, unsigned long phys_offset) -{ - Elf64_Rela *rela_start, *rela_end, *rela; - int 
r_type, r_sym, rc; - Elf64_Addr loc, val; - Elf64_Sym *dynsym; - - rela_start = (Elf64_Rela *) vmlinux.rela_dyn_start; - rela_end = (Elf64_Rela *) vmlinux.rela_dyn_end; - dynsym = (Elf64_Sym *) vmlinux.dynsym_start; - for (rela = rela_start; rela < rela_end; rela++) { - loc = rela->r_offset + phys_offset - __START_KERNEL; - val = rela->r_addend; - r_sym = ELF64_R_SYM(rela->r_info); - if (r_sym) { - if (dynsym[r_sym].st_shndx != SHN_UNDEF) - val += dynsym[r_sym].st_value + offset - __START_KERNEL; - } else { - /* - * 0 == undefined symbol table index (SHN_UNDEF), - * used for R_390_RELATIVE, only add KASLR offset - */ - val += offset - __START_KERNEL; - } - r_type = ELF64_R_TYPE(rela->r_info); - rc = arch_kexec_do_relocs(r_type, (void *) loc, val, 0); - if (rc) - error("Unknown relocation type"); - } -} - -static void kaslr_adjust_got(unsigned long offset) {} -#else static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, unsigned long offset, unsigned long phys_offset) { @@ -212,7 +177,6 @@ static void kaslr_adjust_got(unsigned long offset) for (entry = (u64 *)vmlinux.got_start; entry < (u64 *)vmlinux.got_end; entry++) *entry += offset - __START_KERNEL; } -#endif /* * Merge information from several sources into a single ident_map_size value. @@ -398,14 +362,8 @@ static void kaslr_adjust_vmlinux_info(long offset) { vmlinux.bootdata_off += offset; vmlinux.bootdata_preserved_off += offset; -#ifdef CONFIG_PIE_BUILD - vmlinux.rela_dyn_start += offset; - vmlinux.rela_dyn_end += offset; - vmlinux.dynsym_start += offset; -#else vmlinux.got_start += offset; vmlinux.got_end += offset; -#endif vmlinux.init_mm_off += offset; vmlinux.swapper_pg_dir_off += offset; vmlinux.invalid_pg_dir_off += offset; diff --git a/arch/s390/boot/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S index d6454ec01e22..1fe5a1d3ff60 100644 --- a/arch/s390/boot/vmlinux.lds.S +++ b/arch/s390/boot/vmlinux.lds.S @@ -99,14 +99,12 @@ SECTIONS _decompressor_end = .; -#ifndef CONFIG_PIE_BUILD . = ALIGN(4); .vmlinux.relocs : { __vmlinux_relocs_64_start = .; *(.vmlinux.relocs_64) __vmlinux_relocs_64_end = .; } -#endif #ifdef CONFIG_KERNEL_UNCOMPRESSED . = ALIGN(PAGE_SIZE); diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 163d64f300c1..a1ce3925ec71 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -192,31 +192,6 @@ SECTIONS PERCPU_SECTION(0x100) -#ifdef CONFIG_PIE_BUILD - .dynsym ALIGN(8) : { - __dynsym_start = .; - *(.dynsym) - __dynsym_end = .; - } - .rela.dyn ALIGN(8) : { - __rela_dyn_start = .; - *(.rela*) - __rela_dyn_end = .; - } - .dynamic ALIGN(8) : { - *(.dynamic) - } - .dynstr ALIGN(8) : { - *(.dynstr) - } - .hash ALIGN(8) : { - *(.hash) - } - .gnu.hash ALIGN(8) : { - *(.gnu.hash) - } -#endif - . 
= ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ @@ -238,14 +213,8 @@ SECTIONS QUAD(__boot_data_preserved_start) /* bootdata_preserved_off */ QUAD(__boot_data_preserved_end - __boot_data_preserved_start) /* bootdata_preserved_size */ -#ifdef CONFIG_PIE_BUILD - QUAD(__dynsym_start) /* dynsym_start */ - QUAD(__rela_dyn_start) /* rela_dyn_start */ - QUAD(__rela_dyn_end) /* rela_dyn_end */ -#else QUAD(__got_start) /* got_start */ QUAD(__got_end) /* got_end */ -#endif QUAD(_eamode31 - _samode31) /* amode31_size */ QUAD(init_mm) QUAD(swapper_pg_dir) @@ -281,12 +250,10 @@ SECTIONS *(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt) } ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") -#ifndef CONFIG_PIE_BUILD .rela.dyn : { *(.rela.*) *(.rela_*) } ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!") -#endif /* Sections to be discarded */ DISCARDS From 9ecaa2e94e602a3cbcbfe182535f6297f7630b98 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 30 Apr 2024 17:58:51 +0200 Subject: [PATCH 52/54] s390: Relocate vmlinux ELF data to virtual address space Currently kernel image relocation tables and other ELF data are set to base zero. Since kernel virtual and physical address spaces are uncoupled the kernel is mapped at the top of the virtual address space, hence making the information contained in vmlinux ELF tables inconsistent. That does not pose any issue with regard to the kernel booting and operation, but makes it difficult to use a generated vmlinux with some debugging tools (e.g. gdb). Relocate vmlinux image base address from zero to a base address in the virtual address space. It is the address that kernel is mapped to in cases KASLR is disabled. The vmlinux ELF header before and after this change looks like this: Elf file type is EXEC (Executable file) Entry point 0x100000 There are 3 program headers, starting at offset 64 Program Headers: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flags Align LOAD 0x0000000000001000 0x0000000000100000 0x0000000000100000 0x0000000001323378 0x0000000001323378 R E 0x1000 LOAD 0x0000000001325000 0x0000000001424000 0x0000000001424000 0x00000000003a4200 0x000000000048fdb8 RWE 0x1000 NOTE 0x00000000012a33b0 0x00000000013a23b0 0x00000000013a23b0 0x0000000000000054 0x0000000000000054 0x4 Elf file type is EXEC (Executable file) Entry point 0x3ffe0000000 There are 3 program headers, starting at offset 64 Program Headers: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flags Align LOAD 0x0000000000001000 0x000003ffe0000000 0x000003ffe0000000 0x0000000001323378 0x0000000001323378 R E 0x1000 LOAD 0x0000000001325000 0x000003ffe1324000 0x000003ffe1324000 0x00000000003a4200 0x000000000048fdb8 RWE 0x1000 NOTE 0x00000000012a33b0 0x000003ffe12a23b0 0x000003ffe12a23b0 0x0000000000000054 0x0000000000000054 0x4 Suggested-by: Vasily Gorbik Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/page.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 224ff9d433ea..7880b7b3173e 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -276,8 +276,8 @@ static inline unsigned long virt_to_pfn(const void *kaddr) #define AMODE31_SIZE (3 * PAGE_SIZE) #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) -#define __START_KERNEL 0x100000 -#define __NO_KASLR_START_KERNEL CONFIG_KERNEL_IMAGE_BASE +#define __START_KERNEL CONFIG_KERNEL_IMAGE_BASE +#define __NO_KASLR_START_KERNEL 
__START_KERNEL #define __NO_KASLR_END_KERNEL (__NO_KASLR_START_KERNEL + KERNEL_IMAGE_SIZE) #endif /* _S390_PAGE_H */ From cc4edb92f55aea6eb26930087c8075bdf9967b1b Mon Sep 17 00:00:00 2001 From: Nina Schoetterl-Glausch Date: Mon, 29 Apr 2024 19:15:12 +0200 Subject: [PATCH 53/54] KVM: s390: vsie: Use virt_to_phys for crypto control block The address of the crypto control block in the (shadow) SIE block is absolute/physical. Convert from virtual to physical when shadowing the guest's control block during VSIE. Signed-off-by: Nina Schoetterl-Glausch Reviewed-by: Christian Borntraeger Acked-by: Alexander Gordeev Link: https://lore.kernel.org/r/20240429171512.879215-1-nsg@linux.ibm.com Signed-off-by: Alexander Gordeev --- arch/s390/kvm/vsie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index d8527a046cf7..c9ecae830634 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -362,7 +362,7 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) case -EACCES: return set_validity_icpt(scb_s, 0x003CU); } - scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT2; + scb_s->crycbd = (u32)virt_to_phys(&vsie_page->crycb) | CRYCB_FORMAT2; return 0; } From 1812dc9c334f98227c65bc9c475f16fb6840a94b Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Sun, 5 May 2024 12:47:10 +0200 Subject: [PATCH 54/54] Revert "s390: Relocate vmlinux ELF data to virtual address space" This reverts commit 9ecaa2e94e602a3cbcbfe182535f6297f7630b98. In case CONFIG_MODULES kernel option is not defined the build fails with the following linker error: block/partitions/ibm.o: in function `ibm_partition': ibm.c:(.text+0x8bc): relocation truncated to fit: R_390_PLT32DBL against undefined symbol `dasd_biodasdinfo' Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/page.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 7880b7b3173e..224ff9d433ea 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -276,8 +276,8 @@ static inline unsigned long virt_to_pfn(const void *kaddr) #define AMODE31_SIZE (3 * PAGE_SIZE) #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) -#define __START_KERNEL CONFIG_KERNEL_IMAGE_BASE -#define __NO_KASLR_START_KERNEL __START_KERNEL +#define __START_KERNEL 0x100000 +#define __NO_KASLR_START_KERNEL CONFIG_KERNEL_IMAGE_BASE #define __NO_KASLR_END_KERNEL (__NO_KASLR_START_KERNEL + KERNEL_IMAGE_SIZE) #endif /* _S390_PAGE_H */
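An editorial observation on the failure mode behind this revert (an assumption drawn from the relocation type, not stated in the commit message): R_390_PLT32DBL is a 32-bit, halfword-scaled PC-relative relocation, so a branch can only reach targets within roughly +/- 4 GB of the call site. With the image base moved up to 0x3ffe0000000, a reference to dasd_biodasdinfo, which stays undefined in a !CONFIG_MODULES build without the dasd driver, resolves to an address far outside that range, hence "relocation truncated to fit".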