From f66ae597411cb0d11ed3c281ef2ae809c2e25cb0 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Thu, 28 Mar 2024 16:57:54 +0100 Subject: [PATCH 01/64] drivers: perf: Remove the now superfluous sentinel elements from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) Remove sentinel from sbi_pmu_sysctl_table Signed-off-by: Joel Granados Link: https://lore.kernel.org/r/20240328-jag-sysctl_remset_misc-v1-7-47c1463b3af2@samsung.com Signed-off-by: Will Deacon --- drivers/perf/riscv_pmu_sbi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index 8cbe6e5f9c39..5aef5a8737b2 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -1043,7 +1043,6 @@ static struct ctl_table sbi_pmu_sysctl_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, - { } }; static int pmu_sbi_device_probe(struct platform_device *pdev) From 105350fe07862c7f919828250f306ec674240b66 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 4 Apr 2024 19:59:23 +0300 Subject: [PATCH 02/64] drivers/perf: thunderx2_pmu: Replace open coded acpi_match_acpi_device() Replace open coded acpi_match_acpi_device() in get_tx2_pmu_type(). Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240404170016.2466898-1-andriy.shevchenko@linux.intel.com Signed-off-by: Will Deacon --- drivers/perf/thunderx2_pmu.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/perf/thunderx2_pmu.c b/drivers/perf/thunderx2_pmu.c index e16d10c763de..f03aa85072ec 100644 --- a/drivers/perf/thunderx2_pmu.c +++ b/drivers/perf/thunderx2_pmu.c @@ -504,24 +504,19 @@ static void tx2_uncore_event_update(struct perf_event *event) static enum tx2_uncore_type get_tx2_pmu_type(struct acpi_device *adev) { - int i = 0; - struct acpi_tx2_pmu_device { - __u8 id[ACPI_ID_LEN]; - enum tx2_uncore_type type; - } devices[] = { + struct acpi_device_id devices[] = { {"CAV901D", PMU_TYPE_L3C}, {"CAV901F", PMU_TYPE_DMC}, {"CAV901E", PMU_TYPE_CCPI2}, - {"", PMU_TYPE_INVALID} + {} }; + const struct acpi_device_id *id; - while (devices[i].type != PMU_TYPE_INVALID) { - if (!strcmp(acpi_device_hid(adev), devices[i].id)) - break; - i++; - } + id = acpi_match_acpi_device(devices, adev); + if (!id) + return PMU_TYPE_INVALID; - return devices[i].type; + return (enum tx2_uncore_type)id->driver_data; } static bool tx2_uncore_validate_event(struct pmu *pmu, From 897fa2c38c076c801bd1f1238af0af927e339c8f Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 3 Apr 2024 23:59:41 +0800 Subject: [PATCH 03/64] cpumask: add cpumask_any_and_but() In some cases, it's useful to be able to select a random cpu from the intersection of two masks, excluding a particular CPU. For example, in some systems an uncore PMU is shared by a subset of CPUs, and management of this PMU is assigned to some arbitrary CPU in this set. Whenever the management CPU is hotplugged out, we wish to migrate responsibility to another arbitrary CPU which is both in this set and online. Today we can use cpumask_any_and() to select an arbitrary CPU in the intersection of two masks. We can also use cpumask_any_but() to select any arbitrary cpu in a mask excluding, a particular CPU. 
To do both, we either need to use a temporary cpumask, which is wasteful, or use some lower-level cpumask helpers, which can be unclear. This patch adds a new cpumask_any_and_but() to cater for these cases. Signed-off-by: Mark Rutland Cc: Thomas Gleixner Cc: Andrew Morton Cc: Peter Zijlstra Cc: Rusty Russell Cc: linux-kernel@vger.kernel.org Signed-off-by: Dawei Li Acked-by: Yury Norov Link: https://lore.kernel.org/r/20240403155950.2068109-2-dawei.li@shingroup.cn Signed-off-by: Will Deacon --- include/linux/cpumask.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 1c29947db848..121f3ac757ff 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -388,6 +388,29 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) return i; } +/** + * cpumask_any_and_but - pick a "random" cpu from *mask1 & *mask2, but not this one. + * @mask1: the first input cpumask + * @mask2: the second input cpumask + * @cpu: the cpu to ignore + * + * Returns >= nr_cpu_ids if no cpus set. + */ +static inline +unsigned int cpumask_any_and_but(const struct cpumask *mask1, + const struct cpumask *mask2, + unsigned int cpu) +{ + unsigned int i; + + cpumask_check(cpu); + i = cpumask_first_and(mask1, mask2); + if (i != cpu) + return i; + + return cpumask_next_and(cpu, mask1, mask2); +} + /** * cpumask_nth - get the Nth cpu in a cpumask * @srcp: the cpumask pointer From 2f6589df124ee8cbe1772353e533c3fcc4319a24 Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Wed, 3 Apr 2024 23:59:42 +0800 Subject: [PATCH 04/64] perf/alibaba_uncore_drw: Avoid placing cpumask on the stack In general it's preferable to avoid placing cpumasks on the stack, as for large values of NR_CPUS these can consume significant amounts of stack space and make stack overflows more likely. Use cpumask_any_and_but() to avoid the need for a temporary cpumask on the stack. Suggested-by: Mark Rutland Reviewed-by: Mark Rutland Signed-off-by: Dawei Li Reviewed-by: Shuai Xue Link: https://lore.kernel.org/r/20240403155950.2068109-3-dawei.li@shingroup.cn Signed-off-by: Will Deacon --- drivers/perf/alibaba_uncore_drw_pmu.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/perf/alibaba_uncore_drw_pmu.c b/drivers/perf/alibaba_uncore_drw_pmu.c index a9277dcf90ce..d4d14b65c4a5 100644 --- a/drivers/perf/alibaba_uncore_drw_pmu.c +++ b/drivers/perf/alibaba_uncore_drw_pmu.c @@ -746,18 +746,14 @@ static int ali_drw_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) struct ali_drw_pmu_irq *irq; struct ali_drw_pmu *drw_pmu; unsigned int target; - int ret; - cpumask_t node_online_cpus; irq = hlist_entry_safe(node, struct ali_drw_pmu_irq, node); if (cpu != irq->cpu) return 0; - ret = cpumask_and(&node_online_cpus, - cpumask_of_node(cpu_to_node(cpu)), cpu_online_mask); - if (ret) - target = cpumask_any_but(&node_online_cpus, cpu); - else + target = cpumask_any_and_but(cpumask_of_node(cpu_to_node(cpu)), + cpu_online_mask, cpu); + if (target >= nr_cpu_ids) target = cpumask_any_but(cpu_online_mask, cpu); if (target >= nr_cpu_ids) From 60c73240f304a654b66811f7f56a3325201f46de Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Wed, 3 Apr 2024 23:59:43 +0800 Subject: [PATCH 05/64] perf/arm-cmn: Avoid placing cpumask on the stack In general it's preferable to avoid placing cpumasks on the stack, as for large values of NR_CPUS these can consume significant amounts of stack space and make stack overflows more likely. 
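Patch 03 above adds cpumask_any_and_but() for exactly this situation; the replacement pattern applied across these drivers is sketched below. This is a rough illustration only, using a hypothetical example_pmu type with assumed field names (associated_cpus, on_cpu), not code taken from any single driver, and it presumes <linux/cpumask.h>, <linux/cpuhotplug.h> and <linux/perf_event.h>:

struct example_pmu {
	struct pmu pmu;
	struct hlist_node node;
	cpumask_t associated_cpus;	/* CPUs that may own this PMU */
	int on_cpu;			/* current owner */
};

static int example_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
{
	struct example_pmu *epmu = hlist_entry_safe(node, struct example_pmu, node);
	unsigned int target;

	/* Nothing to do if this CPU does not own the PMU */
	if (cpu != epmu->on_cpu)
		return 0;

	/* Pick another online CPU associated with the PMU, no temporary cpumask needed */
	target = cpumask_any_and_but(&epmu->associated_cpus, cpu_online_mask, cpu);
	if (target >= nr_cpu_ids)
		target = cpumask_any_but(cpu_online_mask, cpu);
	if (target >= nr_cpu_ids)
		return 0;

	perf_pmu_migrate_context(&epmu->pmu, cpu, target);
	epmu->on_cpu = target;
	return 0;
}
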
Use cpumask_any_and_but() to avoid the need for a temporary cpumask on the stack. Suggested-by: Mark Rutland Reviewed-by: Mark Rutland Signed-off-by: Dawei Li Link: https://lore.kernel.org/r/20240403155950.2068109-4-dawei.li@shingroup.cn Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 7ef9c7e4836b..6bfb0c4a1287 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -1950,20 +1950,20 @@ static int arm_cmn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_no struct arm_cmn *cmn; unsigned int target; int node; - cpumask_t mask; cmn = hlist_entry_safe(cpuhp_node, struct arm_cmn, cpuhp_node); if (cpu != cmn->cpu) return 0; node = dev_to_node(cmn->dev); - if (cpumask_and(&mask, cpumask_of_node(node), cpu_online_mask) && - cpumask_andnot(&mask, &mask, cpumask_of(cpu))) - target = cpumask_any(&mask); - else + + target = cpumask_any_and_but(cpumask_of_node(node), cpu_online_mask, cpu); + if (target >= nr_cpu_ids) target = cpumask_any_but(cpu_online_mask, cpu); + if (target < nr_cpu_ids) arm_cmn_migrate(cmn, target); + return 0; } From b5310fa1fe8e29e82dd88ef23e2f04ac533548e1 Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Wed, 3 Apr 2024 23:59:44 +0800 Subject: [PATCH 06/64] perf/arm_cspmu: Avoid placing cpumask on the stack In general it's preferable to avoid placing cpumasks on the stack, as for large values of NR_CPUS these can consume significant amounts of stack space and make stack overflows more likely. Use cpumask_any_and_but() to avoid the need for a temporary cpumask on the stack. Suggested-by: Mark Rutland Reviewed-by: Mark Rutland Signed-off-by: Dawei Li Link: https://lore.kernel.org/r/20240403155950.2068109-5-dawei.li@shingroup.cn Signed-off-by: Will Deacon --- drivers/perf/arm_cspmu/arm_cspmu.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index b9a252272f1e..fd1004251665 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -1322,8 +1322,7 @@ static int arm_cspmu_cpu_online(unsigned int cpu, struct hlist_node *node) static int arm_cspmu_cpu_teardown(unsigned int cpu, struct hlist_node *node) { - int dst; - struct cpumask online_supported; + unsigned int dst; struct arm_cspmu *cspmu = hlist_entry_safe(node, struct arm_cspmu, cpuhp_node); @@ -1333,9 +1332,8 @@ static int arm_cspmu_cpu_teardown(unsigned int cpu, struct hlist_node *node) return 0; /* Choose a new CPU to migrate ownership of the PMU to */ - cpumask_and(&online_supported, &cspmu->associated_cpus, - cpu_online_mask); - dst = cpumask_any_but(&online_supported, cpu); + dst = cpumask_any_and_but(&cspmu->associated_cpus, + cpu_online_mask, cpu); if (dst >= nr_cpu_ids) return 0; From bea2a13b207ef48732daf329564101a07df14e3a Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Wed, 3 Apr 2024 23:59:45 +0800 Subject: [PATCH 07/64] perf/arm_dsu: Avoid placing cpumask on the stack In general it's preferable to avoid placing cpumasks on the stack, as for large values of NR_CPUS these can consume significant amounts of stack space and make stack overflows more likely. Use cpumask_any_and_but() to avoid the need for a temporary cpumask on the stack. 
Suggested-by: Mark Rutland Reviewed-by: Mark Rutland Signed-off-by: Dawei Li Link: https://lore.kernel.org/r/20240403155950.2068109-6-dawei.li@shingroup.cn Signed-off-by: Will Deacon --- drivers/perf/arm_dsu_pmu.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c index bae3ca37f846..adc0bbb5fafe 100644 --- a/drivers/perf/arm_dsu_pmu.c +++ b/drivers/perf/arm_dsu_pmu.c @@ -230,15 +230,6 @@ static const struct attribute_group *dsu_pmu_attr_groups[] = { NULL, }; -static int dsu_pmu_get_online_cpu_any_but(struct dsu_pmu *dsu_pmu, int cpu) -{ - struct cpumask online_supported; - - cpumask_and(&online_supported, - &dsu_pmu->associated_cpus, cpu_online_mask); - return cpumask_any_but(&online_supported, cpu); -} - static inline bool dsu_pmu_counter_valid(struct dsu_pmu *dsu_pmu, u32 idx) { return (idx < dsu_pmu->num_counters) || @@ -827,14 +818,16 @@ static int dsu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node) static int dsu_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node) { - int dst; - struct dsu_pmu *dsu_pmu = hlist_entry_safe(node, struct dsu_pmu, - cpuhp_node); + struct dsu_pmu *dsu_pmu; + unsigned int dst; + + dsu_pmu = hlist_entry_safe(node, struct dsu_pmu, cpuhp_node); if (!cpumask_test_and_clear_cpu(cpu, &dsu_pmu->active_cpu)) return 0; - dst = dsu_pmu_get_online_cpu_any_but(dsu_pmu, cpu); + dst = cpumask_any_and_but(&dsu_pmu->associated_cpus, + cpu_online_mask, cpu); /* If there are no active CPUs in the DSU, leave IRQ disabled */ if (dst >= nr_cpu_ids) return 0; From cf276ee46bc44aa188d6a9ea36f83118f48bac67 Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Wed, 3 Apr 2024 23:59:46 +0800 Subject: [PATCH 08/64] perf/dwc_pcie: Avoid placing cpumask on the stack In general it's preferable to avoid placing cpumasks on the stack, as for large values of NR_CPUS these can consume significant amounts of stack space and make stack overflows more likely. Use cpumask_any_and_but() to avoid the need for a temporary cpumask on the stack. 
Suggested-by: Mark Rutland Reviewed-by: Mark Rutland Signed-off-by: Dawei Li Reviewed-by: Shuai Xue Link: https://lore.kernel.org/r/20240403155950.2068109-7-dawei.li@shingroup.cn Signed-off-by: Will Deacon --- drivers/perf/dwc_pcie_pmu.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c index 957058ad0099..c5e328f23841 100644 --- a/drivers/perf/dwc_pcie_pmu.c +++ b/drivers/perf/dwc_pcie_pmu.c @@ -690,9 +690,8 @@ static int dwc_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_n { struct dwc_pcie_pmu *pcie_pmu; struct pci_dev *pdev; - int node; - cpumask_t mask; unsigned int target; + int node; pcie_pmu = hlist_entry_safe(cpuhp_node, struct dwc_pcie_pmu, cpuhp_node); /* Nothing to do if this CPU doesn't own the PMU */ @@ -702,10 +701,9 @@ static int dwc_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_n pcie_pmu->on_cpu = -1; pdev = pcie_pmu->pdev; node = dev_to_node(&pdev->dev); - if (cpumask_and(&mask, cpumask_of_node(node), cpu_online_mask) && - cpumask_andnot(&mask, &mask, cpumask_of(cpu))) - target = cpumask_any(&mask); - else + + target = cpumask_any_and_but(cpumask_of_node(node), cpu_online_mask, cpu); + if (target >= nr_cpu_ids) target = cpumask_any_but(cpu_online_mask, cpu); if (target >= nr_cpu_ids) { From d7df79e6af29f99d149b8995e68813d77d381e63 Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Wed, 3 Apr 2024 23:59:47 +0800 Subject: [PATCH 09/64] perf/hisi_pcie: Avoid placing cpumask on the stack In general it's preferable to avoid placing cpumasks on the stack, as for large values of NR_CPUS these can consume significant amounts of stack space and make stack overflows more likely. Use cpumask_any_and_but() to avoid the need for a temporary cpumask on the stack. Suggested-by: Mark Rutland Reviewed-by: Mark Rutland Signed-off-by: Dawei Li Acked-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240403155950.2068109-8-dawei.li@shingroup.cn Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_pcie_pmu.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c index 5d1f0e9fdb08..06b192cc31d5 100644 --- a/drivers/perf/hisilicon/hisi_pcie_pmu.c +++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c @@ -673,7 +673,6 @@ static int hisi_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) { struct hisi_pcie_pmu *pcie_pmu = hlist_entry_safe(node, struct hisi_pcie_pmu, node); unsigned int target; - cpumask_t mask; int numa_node; /* Nothing to do if this CPU doesn't own the PMU */ @@ -684,10 +683,10 @@ static int hisi_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) /* Choose a local CPU from all online cpus. 
*/ numa_node = dev_to_node(&pcie_pmu->pdev->dev); - if (cpumask_and(&mask, cpumask_of_node(numa_node), cpu_online_mask) && - cpumask_andnot(&mask, &mask, cpumask_of(cpu))) - target = cpumask_any(&mask); - else + + target = cpumask_any_and_but(cpumask_of_node(numa_node), + cpu_online_mask, cpu); + if (target >= nr_cpu_ids) target = cpumask_any_but(cpu_online_mask, cpu); if (target >= nr_cpu_ids) { From b78d0fa25462405f0f123eb827a9e1bc0e595d52 Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Wed, 3 Apr 2024 23:59:48 +0800 Subject: [PATCH 10/64] perf/hisi_uncore: Avoid placing cpumask on the stack In general it's preferable to avoid placing cpumasks on the stack, as for large values of NR_CPUS these can consume significant amounts of stack space and make stack overflows more likely. Use cpumask_any_and_but() to avoid the need for a temporary cpumask on the stack. Suggested-by: Mark Rutland Reviewed-by: Mark Rutland Signed-off-by: Dawei Li Acked-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240403155950.2068109-9-dawei.li@shingroup.cn Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_uncore_pmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pmu.c index 04031450d5fe..ccc9191ad1b6 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c @@ -504,7 +504,6 @@ int hisi_uncore_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) { struct hisi_pmu *hisi_pmu = hlist_entry_safe(node, struct hisi_pmu, node); - cpumask_t pmu_online_cpus; unsigned int target; if (!cpumask_test_and_clear_cpu(cpu, &hisi_pmu->associated_cpus)) @@ -518,9 +517,8 @@ int hisi_uncore_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) hisi_pmu->on_cpu = -1; /* Choose a new CPU to migrate ownership of the PMU to */ - cpumask_and(&pmu_online_cpus, &hisi_pmu->associated_cpus, - cpu_online_mask); - target = cpumask_any_but(&pmu_online_cpus, cpu); + target = cpumask_any_and_but(&hisi_pmu->associated_cpus, + cpu_online_mask, cpu); if (target >= nr_cpu_ids) return 0; From fc85cee97029dee3acb4dcefe4af01b8f8022699 Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Wed, 3 Apr 2024 23:59:49 +0800 Subject: [PATCH 11/64] perf/qcom_l2: Avoid placing cpumask on the stack In general it's preferable to avoid placing cpumasks on the stack, as for large values of NR_CPUS these can consume significant amounts of stack space and make stack overflows more likely. Use cpumask_any_and_but() to avoid the need for a temporary cpumask on the stack. 
Suggested-by: Mark Rutland Reviewed-by: Mark Rutland Signed-off-by: Dawei Li Link: https://lore.kernel.org/r/20240403155950.2068109-10-dawei.li@shingroup.cn Signed-off-by: Will Deacon --- drivers/perf/qcom_l2_pmu.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/perf/qcom_l2_pmu.c b/drivers/perf/qcom_l2_pmu.c index 148df5ae8ef8..b5a44dc1dc3a 100644 --- a/drivers/perf/qcom_l2_pmu.c +++ b/drivers/perf/qcom_l2_pmu.c @@ -801,9 +801,8 @@ static int l2cache_pmu_online_cpu(unsigned int cpu, struct hlist_node *node) static int l2cache_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) { - struct cluster_pmu *cluster; struct l2cache_pmu *l2cache_pmu; - cpumask_t cluster_online_cpus; + struct cluster_pmu *cluster; unsigned int target; l2cache_pmu = hlist_entry_safe(node, struct l2cache_pmu, node); @@ -820,9 +819,8 @@ static int l2cache_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) cluster->on_cpu = -1; /* Any other CPU for this cluster which is still online */ - cpumask_and(&cluster_online_cpus, &cluster->cluster_cpus, - cpu_online_mask); - target = cpumask_any_but(&cluster_online_cpus, cpu); + target = cpumask_any_and_but(&cluster->cluster_cpus, + cpu_online_mask, cpu); if (target >= nr_cpu_ids) { disable_irq(cluster->irq); return 0; From 595275ca498485667039e8c190453a9a684687cb Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Wed, 3 Apr 2024 23:59:50 +0800 Subject: [PATCH 12/64] perf/thunderx2: Avoid placing cpumask on the stack In general it's preferable to avoid placing cpumasks on the stack, as for large values of NR_CPUS these can consume significant amounts of stack space and make stack overflows more likely. Use cpumask_any_and_but() to avoid the need for a temporary cpumask on the stack. Suggested-by: Mark Rutland Reviewed-by: Mark Rutland Signed-off-by: Dawei Li Link: https://lore.kernel.org/r/20240403155950.2068109-11-dawei.li@shingroup.cn Signed-off-by: Will Deacon --- drivers/perf/thunderx2_pmu.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/perf/thunderx2_pmu.c b/drivers/perf/thunderx2_pmu.c index f03aa85072ec..33e8ff3e5265 100644 --- a/drivers/perf/thunderx2_pmu.c +++ b/drivers/perf/thunderx2_pmu.c @@ -927,9 +927,8 @@ static int tx2_uncore_pmu_online_cpu(unsigned int cpu, static int tx2_uncore_pmu_offline_cpu(unsigned int cpu, struct hlist_node *hpnode) { - int new_cpu; struct tx2_uncore_pmu *tx2_pmu; - struct cpumask cpu_online_mask_temp; + unsigned int new_cpu; tx2_pmu = hlist_entry_safe(hpnode, struct tx2_uncore_pmu, hpnode); @@ -940,11 +939,8 @@ static int tx2_uncore_pmu_offline_cpu(unsigned int cpu, if (tx2_pmu->hrtimer_callback) hrtimer_cancel(&tx2_pmu->hrtimer); - cpumask_copy(&cpu_online_mask_temp, cpu_online_mask); - cpumask_clear_cpu(cpu, &cpu_online_mask_temp); - new_cpu = cpumask_any_and( - cpumask_of_node(tx2_pmu->node), - &cpu_online_mask_temp); + new_cpu = cpumask_any_and_but(cpumask_of_node(tx2_pmu->node), + cpu_online_mask, cpu); tx2_pmu->cpu = new_cpu; if (new_cpu >= nr_cpu_ids) From 8f9f5041c64600b01b71f29fb8e2121e45bfb719 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 9 Apr 2024 18:15:17 +0100 Subject: [PATCH 13/64] perf/arm-cmn: Set PMU device parent Now that perf supports giving the PMU device a parent, we can use our platform device to make the relationship between CMN instances and PMU IDs trivially discoverable, from either nominal direction: root@crazy-taxi:~# ls /sys/devices/platform/ARMHC600:00 | grep cmn arm_cmn_0 root@crazy-taxi:~# realpath 
/sys/bus/event_source/devices/arm_cmn_0/.. /sys/devices/platform/ARMHC600:00 Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/25d4428df1ddad966c74a3ed60171cd3ca6c8b66.1712682917.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 6bfb0c4a1287..e26ad1d3ed0b 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -2482,6 +2482,7 @@ static int arm_cmn_probe(struct platform_device *pdev) cmn->cpu = cpumask_local_spread(0, dev_to_node(cmn->dev)); cmn->pmu = (struct pmu) { .module = THIS_MODULE, + .parent = cmn->dev, .attr_groups = arm_cmn_attr_groups, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, .task_ctx_nr = perf_invalid_context, From 98631c4904bf6380834c8585ce50451f00eb5389 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Thu, 14 Mar 2024 14:38:19 +0800 Subject: [PATCH 14/64] arm64: Remove unnecessary irqflags alternative.h include Since commit 20af807d806d ("arm64: Avoid cpus_have_const_cap() for ARM64_HAS_GIC_PRIO_MASKING"), the alternative.h include is not used, so remove it. Fixes: 20af807d806d ("arm64: Avoid cpus_have_const_cap() for ARM64_HAS_GIC_PRIO_MASKING") Signed-off-by: Jinjie Ruan Link: https://lore.kernel.org/r/20240314063819.2636445-1-ruanjinjie@huawei.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/irqflags.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h index 0a7186a93882..d4d7451c2c12 100644 --- a/arch/arm64/include/asm/irqflags.h +++ b/arch/arm64/include/asm/irqflags.h @@ -5,7 +5,6 @@ #ifndef __ASM_IRQFLAGS_H #define __ASM_IRQFLAGS_H -#include #include #include #include From e07255d69702bc9131427fda8f9749355b10780f Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 5 Apr 2024 13:58:51 +1000 Subject: [PATCH 15/64] arm64: tlb: Improve __TLBI_VADDR_RANGE() The macro returns the operand of TLBI RANGE instruction. A mask needs to be applied to each individual field upon producing the operand, to avoid the adjacent fields can interfere with each other when invalid arguments have been provided. The code looks more tidy at least with a mask and FIELD_PREP(). Suggested-by: Marc Zyngier Signed-off-by: Gavin Shan Reviewed-by: Ryan Roberts Reviewed-by: Catalin Marinas Reviewed-by: Anshuman Khandual Reviewed-by: Shaoqin Huang Link: https://lore.kernel.org/r/20240405035852.1532010-3-gshan@redhat.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/tlbflush.h | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index a75de2665d84..243d71f7bc1f 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -142,17 +142,24 @@ static inline unsigned long get_trans_granule(void) * EL1, Inner Shareable". * */ -#define __TLBI_VADDR_RANGE(baddr, asid, scale, num, ttl) \ - ({ \ - unsigned long __ta = (baddr); \ - unsigned long __ttl = (ttl >= 1 && ttl <= 3) ? 
ttl : 0; \ - __ta &= GENMASK_ULL(36, 0); \ - __ta |= __ttl << 37; \ - __ta |= (unsigned long)(num) << 39; \ - __ta |= (unsigned long)(scale) << 44; \ - __ta |= get_trans_granule() << 46; \ - __ta |= (unsigned long)(asid) << 48; \ - __ta; \ +#define TLBIR_ASID_MASK GENMASK_ULL(63, 48) +#define TLBIR_TG_MASK GENMASK_ULL(47, 46) +#define TLBIR_SCALE_MASK GENMASK_ULL(45, 44) +#define TLBIR_NUM_MASK GENMASK_ULL(43, 39) +#define TLBIR_TTL_MASK GENMASK_ULL(38, 37) +#define TLBIR_BADDR_MASK GENMASK_ULL(36, 0) + +#define __TLBI_VADDR_RANGE(baddr, asid, scale, num, ttl) \ + ({ \ + unsigned long __ta = 0; \ + unsigned long __ttl = (ttl >= 1 && ttl <= 3) ? ttl : 0; \ + __ta |= FIELD_PREP(TLBIR_BADDR_MASK, baddr); \ + __ta |= FIELD_PREP(TLBIR_TTL_MASK, __ttl); \ + __ta |= FIELD_PREP(TLBIR_NUM_MASK, num); \ + __ta |= FIELD_PREP(TLBIR_SCALE_MASK, scale); \ + __ta |= FIELD_PREP(TLBIR_TG_MASK, get_trans_granule()); \ + __ta |= FIELD_PREP(TLBIR_ASID_MASK, asid); \ + __ta; \ }) /* These macros are used by the TLBI RANGE feature. */ From 73301e464a72a0d007d0d4e0f4d3dab5c58125bf Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 5 Apr 2024 13:58:52 +1000 Subject: [PATCH 16/64] arm64: tlb: Allow range operation for MAX_TLBI_RANGE_PAGES MAX_TLBI_RANGE_PAGES pages is covered by SCALE#3 and NUM#31 and it's supported now. Allow TLBI RANGE operation when the number of pages is equal to MAX_TLBI_RANGE_PAGES in __flush_tlb_range_nosync(). Suggested-by: Marc Zyngier Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Reviewed-by: Ryan Roberts Reviewed-by: Catalin Marinas Reviewed-by: Shaoqin Huang Link: https://lore.kernel.org/r/20240405035852.1532010-4-gshan@redhat.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/tlbflush.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 243d71f7bc1f..95fbc8c05607 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -446,11 +446,11 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma, * When not uses TLB range ops, we can handle up to * (MAX_DVM_OPS - 1) pages; * When uses TLB range ops, we can handle up to - * (MAX_TLBI_RANGE_PAGES - 1) pages. + * MAX_TLBI_RANGE_PAGES pages. */ if ((!system_supports_tlb_range() && (end - start) >= (MAX_DVM_OPS * stride)) || - pages >= MAX_TLBI_RANGE_PAGES) { + pages > MAX_TLBI_RANGE_PAGES) { flush_tlb_mm(vma->vm_mm); return; } From b782e8d07baac95a5ce3f8773cc61f4ed7d0ccbc Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Thu, 11 Apr 2024 20:30:30 +0800 Subject: [PATCH 17/64] arm64: arm_pmuv3: Correctly extract and check the PMUVer Currently we're using "sbfx" to extract the PMUVer from ID_AA64DFR0_EL1 and skip the init/reset if no PMU present when the extracted PMUVer is negative or is zero. However for PMUv3p8 the PMUVer will be 0b1000 and PMUVer extracted by "sbfx" will always be negative and we'll skip the init/reset in __init_el2_debug/reset_pmuserenr_el0 unexpectedly. So this patch use "ubfx" instead of "sbfx" to extract the PMUVer. If the PMUVer is implementation defined (0b1111) or not implemented(0b0000) then skip the reset/init. Previously we'll also skip the init/reset if the PMUVer is higher than the version we known (currently PMUv3p9), with this patch we'll only skip if the PMU is not implemented or implementation defined. This keeps consistence with how we probe the PMU in the driver with pmuv3_implemented(). 
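As a self-contained illustration of the sign-extension pitfall described above (plain user-space C for demonstration only, not kernel code): a 4-bit PMUVer field of 0b1000 (PMUv3p8) looks negative when sign-extracted, so the old "skip if less than 1" check wrongly concludes that no PMU is present, while an unsigned extract yields 8 as expected.

#include <stdio.h>

int main(void)
{
	unsigned int field = 0x8;	/* ID_AA64DFR0_EL1.PMUVer == 0b1000 (PMUv3p8) */

	/* sbfx sign-extends bit 3 of the 4-bit field: 0b1000 becomes -8 */
	int sbfx_view = (field & 0x8) ? (int)field - 16 : (int)field;
	/* ubfx zero-extends the same field: 0b1000 stays 8 */
	unsigned int ubfx_view = field & 0xf;

	printf("sbfx view: %d -> old 'cmp #1; b.lt' check %s\n",
	       sbfx_view, sbfx_view < 1 ? "skips PMU init" : "runs PMU init");
	printf("ubfx view: %u\n", ubfx_view);
	return 0;
}
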
Signed-off-by: Yicong Yang Link: https://lore.kernel.org/r/20240411123030.7201-1-yangyicong@huawei.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/assembler.h | 7 ++++--- arch/arm64/include/asm/el2_setup.h | 9 +++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index ab8b396428da..9ecd076ba08f 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -480,9 +480,10 @@ alternative_endif */ .macro reset_pmuserenr_el0, tmpreg mrs \tmpreg, id_aa64dfr0_el1 - sbfx \tmpreg, \tmpreg, #ID_AA64DFR0_EL1_PMUVer_SHIFT, #4 - cmp \tmpreg, #1 // Skip if no PMU present - b.lt 9000f + ubfx \tmpreg, \tmpreg, #ID_AA64DFR0_EL1_PMUVer_SHIFT, #4 + cmp \tmpreg, #ID_AA64DFR0_EL1_PMUVer_NI + ccmp \tmpreg, #ID_AA64DFR0_EL1_PMUVer_IMP_DEF, #4, ne + b.eq 9000f // Skip if no PMU present or IMP_DEF msr pmuserenr_el0, xzr // Disable PMU access from EL0 9000: .endm diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index b7afaa026842..e4546b29dd0c 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -59,13 +59,14 @@ .macro __init_el2_debug mrs x1, id_aa64dfr0_el1 - sbfx x0, x1, #ID_AA64DFR0_EL1_PMUVer_SHIFT, #4 - cmp x0, #1 - b.lt .Lskip_pmu_\@ // Skip if no PMU present + ubfx x0, x1, #ID_AA64DFR0_EL1_PMUVer_SHIFT, #4 + cmp x0, #ID_AA64DFR0_EL1_PMUVer_NI + ccmp x0, #ID_AA64DFR0_EL1_PMUVer_IMP_DEF, #4, ne + b.eq .Lskip_pmu_\@ // Skip if no PMU present or IMP_DEF mrs x0, pmcr_el0 // Disable debug access traps ubfx x0, x0, #11, #5 // to EL2 and allow access to .Lskip_pmu_\@: - csel x2, xzr, x0, lt // all PMU counters from EL1 + csel x2, xzr, x0, eq // all PMU counters from EL1 /* Statistical profiling */ ubfx x0, x1, #ID_AA64DFR0_EL1_PMSVer_SHIFT, #4 From 0dc1670bd0cef948ce782d6b3902af9bf8604beb Mon Sep 17 00:00:00 2001 From: Simon Glass Date: Fri, 29 Mar 2024 16:28:35 +1300 Subject: [PATCH 18/64] arm64: Add BOOT_TARGETS variable Add a new variable containing a list of possible targets. Mark them as phony. This matches the approach taken for arch/arm Signed-off-by: Simon Glass Reviewed-by: Nicolas Schier Link: https://lore.kernel.org/r/20240329032836.141899-2-sjg@chromium.org Signed-off-by: Will Deacon --- arch/arm64/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 0e075d3c546b..1217d97998ac 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -154,6 +154,10 @@ libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a # Default target when executing plain make boot := arch/arm64/boot +BOOT_TARGETS := Image vmlinuz.efi + +PHONY += $(BOOT_TARGETS) + ifeq ($(CONFIG_EFI_ZBOOT),) KBUILD_IMAGE := $(boot)/Image.gz else @@ -163,7 +167,7 @@ endif all: $(notdir $(KBUILD_IMAGE)) vmlinuz.efi: Image -Image vmlinuz.efi: vmlinux +$(BOOT_TARGETS): vmlinux $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ Image.%: Image From 7a23b027ec17b2eb9c8ad9b09006502f3fa38215 Mon Sep 17 00:00:00 2001 From: Simon Glass Date: Fri, 29 Mar 2024 16:28:36 +1300 Subject: [PATCH 19/64] arm64: boot: Support Flat Image Tree Add a script which produces a Flat Image Tree (FIT), a single file containing the built kernel and associated devicetree files. Compression defaults to gzip which gives a good balance of size and performance. The files compress from about 86MB to 24MB using this approach. 
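As an illustration of the intended workflow (an example only; it assumes an aarch64 cross toolchain, pylibfdt and the u-boot-tools utilities are installed, and the image.fit target itself is added later in this patch):

  make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- defconfig
  make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -j$(nproc) image.fit
  # Pick a different compressor if desired:
  make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- FIT_COMPRESSION=zstd image.fit
  # Inspect the resulting FIT:
  dumpimage -l arch/arm64/boot/image.fit
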
The FIT can be used by bootloaders which support it, such as U-Boot and Linuxboot. It permits automatic selection of the correct devicetree, matching the compatible string of the running board with the closest compatible string in the FIT. There is no need for filenames or other workarounds. Add a 'make image.fit' build target for arm64, as well. The FIT can be examined using 'dumpimage -l'. This uses the 'dtbs-list' file but processes only .dtb files, ignoring the overlay .dtbo files. This features requires pylibfdt (use 'pip install libfdt'). It also requires compression utilities for the algorithm being used. Supported compression options are the same as the Image.xxx files. Use FIT_COMPRESSION to select an algorithm other than gzip. While FIT supports a ramdisk / initrd, no attempt is made to support this here, since it must be built separately from the Linux build. Signed-off-by: Simon Glass Acked-by: Masahiro Yamada Link: https://lore.kernel.org/r/20240329032836.141899-3-sjg@chromium.org Signed-off-by: Will Deacon --- Documentation/process/changes.rst | 9 + MAINTAINERS | 7 + arch/arm64/Makefile | 7 +- arch/arm64/boot/.gitignore | 1 + arch/arm64/boot/Makefile | 6 +- scripts/Makefile.lib | 16 ++ scripts/make_fit.py | 290 ++++++++++++++++++++++++++++++ 7 files changed, 333 insertions(+), 3 deletions(-) create mode 100755 scripts/make_fit.py diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index 7ef8de58f7f8..3a39395bd9d3 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -62,6 +62,7 @@ Sphinx\ [#f1]_ 2.4.4 sphinx-build --version cpio any cpio --version GNU tar 1.28 tar --version gtags (optional) 6.6.5 gtags --version +mkimage (optional) 2017.01 mkimage --version ====================== =============== ======================================== .. [#f1] Sphinx is needed only to build the Kernel documentation @@ -189,6 +190,14 @@ The kernel build requires GNU GLOBAL version 6.6.5 or later to generate tag files through ``make gtags``. This is due to its use of the gtags ``-C (--directory)`` flag. +mkimage +------- + +This tool is used when building a Flat Image Tree (FIT), commonly used on ARM +platforms. The tool is available via the ``u-boot-tools`` package or can be +built from the U-Boot source code. 
See the instructions at +https://docs.u-boot.org/en/latest/build/tools.html#building-tools-for-linux + System utilities **************** diff --git a/MAINTAINERS b/MAINTAINERS index aea47e04c3a5..9db17a2fe3b0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3051,6 +3051,13 @@ F: drivers/mmc/host/sdhci-of-arasan.c N: zynq N: xilinx +ARM64 FIT SUPPORT +M: Simon Glass +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) +S: Maintained +F: arch/arm64/boot/Makefile +F: scripts/make_fit.py + ARM64 PORT (AARCH64 ARCHITECTURE) M: Catalin Marinas M: Will Deacon diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 1217d97998ac..b8b1d4f4a572 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -154,7 +154,7 @@ libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a # Default target when executing plain make boot := arch/arm64/boot -BOOT_TARGETS := Image vmlinuz.efi +BOOT_TARGETS := Image vmlinuz.efi image.fit PHONY += $(BOOT_TARGETS) @@ -166,7 +166,9 @@ endif all: $(notdir $(KBUILD_IMAGE)) -vmlinuz.efi: Image +image.fit: dtbs + +vmlinuz.efi image.fit: Image $(BOOT_TARGETS): vmlinux $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ @@ -219,6 +221,7 @@ virtconfig: define archhelp echo '* Image.gz - Compressed kernel image (arch/$(ARCH)/boot/Image.gz)' echo ' Image - Uncompressed kernel image (arch/$(ARCH)/boot/Image)' + echo ' image.fit - Flat Image Tree (arch/$(ARCH)/boot/image.fit)' echo ' install - Install uncompressed kernel' echo ' zinstall - Install compressed kernel' echo ' Install using (your) ~/bin/installkernel or' diff --git a/arch/arm64/boot/.gitignore b/arch/arm64/boot/.gitignore index af5dc61f8b43..abaae9de1bdd 100644 --- a/arch/arm64/boot/.gitignore +++ b/arch/arm64/boot/.gitignore @@ -2,3 +2,4 @@ Image Image.gz vmlinuz* +image.fit diff --git a/arch/arm64/boot/Makefile b/arch/arm64/boot/Makefile index a5a787371117..607a67a649c4 100644 --- a/arch/arm64/boot/Makefile +++ b/arch/arm64/boot/Makefile @@ -16,7 +16,8 @@ OBJCOPYFLAGS_Image :=-O binary -R .note -R .note.gnu.build-id -R .comment -S -targets := Image Image.bz2 Image.gz Image.lz4 Image.lzma Image.lzo Image.zst +targets := Image Image.bz2 Image.gz Image.lz4 Image.lzma Image.lzo \ + Image.zst image.fit $(obj)/Image: vmlinux FORCE $(call if_changed,objcopy) @@ -39,6 +40,9 @@ $(obj)/Image.lzo: $(obj)/Image FORCE $(obj)/Image.zst: $(obj)/Image FORCE $(call if_changed,zstd) +$(obj)/image.fit: $(obj)/Image $(obj)/dts/dtbs-list FORCE + $(call if_changed,fit) + EFI_ZBOOT_PAYLOAD := Image EFI_ZBOOT_BFD_TARGET := elf64-littleaarch64 EFI_ZBOOT_MACH_TYPE := ARM64 diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 3179747cbd2c..afa1099b6b8e 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -504,6 +504,22 @@ quiet_cmd_uimage = UIMAGE $@ -a $(UIMAGE_LOADADDR) -e $(UIMAGE_ENTRYADDR) \ -n '$(UIMAGE_NAME)' -d $< $@ +# Flat Image Tree (FIT) +# This allows for packaging of a kernel and all devicetrees files, using +# compression. 
+# --------------------------------------------------------------------------- + +MAKE_FIT := $(srctree)/scripts/make_fit.py + +# Use this to override the compression algorithm +FIT_COMPRESSION ?= gzip + +quiet_cmd_fit = FIT $@ + cmd_fit = $(MAKE_FIT) -o $@ --arch $(UIMAGE_ARCH) --os linux \ + --name '$(UIMAGE_NAME)' \ + $(if $(findstring 1,$(KBUILD_VERBOSE)),-v) \ + --compress $(FIT_COMPRESSION) -k $< @$(word 2,$^) + # XZ # --------------------------------------------------------------------------- # Use xzkern to compress the kernel image and xzmisc to compress other things. diff --git a/scripts/make_fit.py b/scripts/make_fit.py new file mode 100755 index 000000000000..3de90c5a094b --- /dev/null +++ b/scripts/make_fit.py @@ -0,0 +1,290 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0+ +# +# Copyright 2024 Google LLC +# Written by Simon Glass +# + +"""Build a FIT containing a lot of devicetree files + +Usage: + make_fit.py -A arm64 -n 'Linux-6.6' -O linux + -o arch/arm64/boot/image.fit -k /tmp/kern/arch/arm64/boot/image.itk + @arch/arm64/boot/dts/dtbs-list -E -c gzip + +Creates a FIT containing the supplied kernel and a set of devicetree files, +either specified individually or listed in a file (with an '@' prefix). + +Use -E to generate an external FIT (where the data is placed after the +FIT data structure). This allows parsing of the data without loading +the entire FIT. + +Use -c to compress the data, using bzip2, gzip, lz4, lzma, lzo and +zstd algorithms. + +The resulting FIT can be booted by bootloaders which support FIT, such +as U-Boot, Linuxboot, Tianocore, etc. + +Note that this tool does not yet support adding a ramdisk / initrd. +""" + +import argparse +import collections +import os +import subprocess +import sys +import tempfile +import time + +import libfdt + + +# Tool extension and the name of the command-line tools +CompTool = collections.namedtuple('CompTool', 'ext,tools') + +COMP_TOOLS = { + 'bzip2': CompTool('.bz2', 'bzip2'), + 'gzip': CompTool('.gz', 'pigz,gzip'), + 'lz4': CompTool('.lz4', 'lz4'), + 'lzma': CompTool('.lzma', 'lzma'), + 'lzo': CompTool('.lzo', 'lzop'), + 'zstd': CompTool('.zstd', 'zstd'), +} + + +def parse_args(): + """Parse the program ArgumentParser + + Returns: + Namespace object containing the arguments + """ + epilog = 'Build a FIT from a directory tree containing .dtb files' + parser = argparse.ArgumentParser(epilog=epilog, fromfile_prefix_chars='@') + parser.add_argument('-A', '--arch', type=str, required=True, + help='Specifies the architecture') + parser.add_argument('-c', '--compress', type=str, default='none', + help='Specifies the compression') + parser.add_argument('-E', '--external', action='store_true', + help='Convert the FIT to use external data') + parser.add_argument('-n', '--name', type=str, required=True, + help='Specifies the name') + parser.add_argument('-o', '--output', type=str, required=True, + help='Specifies the output file (.fit)') + parser.add_argument('-O', '--os', type=str, required=True, + help='Specifies the operating system') + parser.add_argument('-k', '--kernel', type=str, required=True, + help='Specifies the (uncompressed) kernel input file (.itk)') + parser.add_argument('-v', '--verbose', action='store_true', + help='Enable verbose output') + parser.add_argument('dtbs', type=str, nargs='*', + help='Specifies the devicetree files to process') + + return parser.parse_args() + + +def setup_fit(fsw, name): + """Make a start on writing the FIT + + Outputs the root properties and the 'images' node + + Args: 
+ fsw (libfdt.FdtSw): Object to use for writing + name (str): Name of kernel image + """ + fsw.INC_SIZE = 65536 + fsw.finish_reservemap() + fsw.begin_node('') + fsw.property_string('description', f'{name} with devicetree set') + fsw.property_u32('#address-cells', 1) + + fsw.property_u32('timestamp', int(time.time())) + fsw.begin_node('images') + + +def write_kernel(fsw, data, args): + """Write out the kernel image + + Writes a kernel node along with the required properties + + Args: + fsw (libfdt.FdtSw): Object to use for writing + data (bytes): Data to write (possibly compressed) + args (Namespace): Contains necessary strings: + arch: FIT architecture, e.g. 'arm64' + fit_os: Operating Systems, e.g. 'linux' + name: Name of OS, e.g. 'Linux-6.6.0-rc7' + compress: Compression algorithm to use, e.g. 'gzip' + """ + with fsw.add_node('kernel'): + fsw.property_string('description', args.name) + fsw.property_string('type', 'kernel_noload') + fsw.property_string('arch', args.arch) + fsw.property_string('os', args.os) + fsw.property_string('compression', args.compress) + fsw.property('data', data) + fsw.property_u32('load', 0) + fsw.property_u32('entry', 0) + + +def finish_fit(fsw, entries): + """Finish the FIT ready for use + + Writes the /configurations node and subnodes + + Args: + fsw (libfdt.FdtSw): Object to use for writing + entries (list of tuple): List of configurations: + str: Description of model + str: Compatible stringlist + """ + fsw.end_node() + seq = 0 + with fsw.add_node('configurations'): + for model, compat in entries: + seq += 1 + with fsw.add_node(f'conf-{seq}'): + fsw.property('compatible', bytes(compat)) + fsw.property_string('description', model) + fsw.property_string('fdt', f'fdt-{seq}') + fsw.property_string('kernel', 'kernel') + fsw.end_node() + + +def compress_data(inf, compress): + """Compress data using a selected algorithm + + Args: + inf (IOBase): Filename containing the data to compress + compress (str): Compression algorithm, e.g. 'gzip' + + Return: + bytes: Compressed data + """ + if compress == 'none': + return inf.read() + + comp = COMP_TOOLS.get(compress) + if not comp: + raise ValueError(f"Unknown compression algorithm '{compress}'") + + with tempfile.NamedTemporaryFile() as comp_fname: + with open(comp_fname.name, 'wb') as outf: + done = False + for tool in comp.tools.split(','): + try: + subprocess.call([tool, '-c'], stdin=inf, stdout=outf) + done = True + break + except FileNotFoundError: + pass + if not done: + raise ValueError(f'Missing tool(s): {comp.tools}\n') + with open(comp_fname.name, 'rb') as compf: + comp_data = compf.read() + return comp_data + + +def output_dtb(fsw, seq, fname, arch, compress): + """Write out a single devicetree to the FIT + + Args: + fsw (libfdt.FdtSw): Object to use for writing + seq (int): Sequence number (1 for first) + fmame (str): Filename containing the DTB + arch: FIT architecture, e.g. 'arm64' + compress (str): Compressed algorithm, e.g. 
'gzip' + + Returns: + tuple: + str: Model name + bytes: Compatible stringlist + """ + with fsw.add_node(f'fdt-{seq}'): + # Get the compatible / model information + with open(fname, 'rb') as inf: + data = inf.read() + fdt = libfdt.FdtRo(data) + model = fdt.getprop(0, 'model').as_str() + compat = fdt.getprop(0, 'compatible') + + fsw.property_string('description', model) + fsw.property_string('type', 'flat_dt') + fsw.property_string('arch', arch) + fsw.property_string('compression', compress) + fsw.property('compatible', bytes(compat)) + + with open(fname, 'rb') as inf: + compressed = compress_data(inf, compress) + fsw.property('data', compressed) + return model, compat + + +def build_fit(args): + """Build the FIT from the provided files and arguments + + Args: + args (Namespace): Program arguments + + Returns: + tuple: + bytes: FIT data + int: Number of configurations generated + size: Total uncompressed size of data + """ + seq = 0 + size = 0 + fsw = libfdt.FdtSw() + setup_fit(fsw, args.name) + entries = [] + + # Handle the kernel + with open(args.kernel, 'rb') as inf: + comp_data = compress_data(inf, args.compress) + size += os.path.getsize(args.kernel) + write_kernel(fsw, comp_data, args) + + for fname in args.dtbs: + # Ignore overlay (.dtbo) files + if os.path.splitext(fname)[1] == '.dtb': + seq += 1 + size += os.path.getsize(fname) + model, compat = output_dtb(fsw, seq, fname, args.arch, args.compress) + entries.append([model, compat]) + + finish_fit(fsw, entries) + + # Include the kernel itself in the returned file count + return fsw.as_fdt().as_bytearray(), seq + 1, size + + +def run_make_fit(): + """Run the tool's main logic""" + args = parse_args() + + out_data, count, size = build_fit(args) + with open(args.output, 'wb') as outf: + outf.write(out_data) + + ext_fit_size = None + if args.external: + mkimage = os.environ.get('MKIMAGE', 'mkimage') + subprocess.check_call([mkimage, '-E', '-F', args.output], + stdout=subprocess.DEVNULL) + + with open(args.output, 'rb') as inf: + data = inf.read() + ext_fit = libfdt.FdtRo(data) + ext_fit_size = ext_fit.totalsize() + + if args.verbose: + comp_size = len(out_data) + print(f'FIT size {comp_size:#x}/{comp_size / 1024 / 1024:.1f} MB', + end='') + if ext_fit_size: + print(f', header {ext_fit_size:#x}/{ext_fit_size / 1024:.1f} KB', + end='') + print(f', {count} files, uncompressed {size / 1024 / 1024:.1f} MB') + + +if __name__ == "__main__": + sys.exit(run_make_fit()) From 5c63db59c5f89925add57642be4f789d0d671ccd Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 12 Apr 2024 14:19:06 +0100 Subject: [PATCH 20/64] arm64: mm: Don't remap pgtables per-cont(pte|pmd) block A large part of the kernel boot time is creating the kernel linear map page tables. When rodata=full, all memory is mapped by pte. And when there is lots of physical ram, there are lots of pte tables to populate. The primary cost associated with this is mapping and unmapping the pte table memory in the fixmap; at unmap time, the TLB entry must be invalidated and this is expensive. Previously, each pmd and pte table was fixmapped/fixunmapped for each cont(pte|pmd) block of mappings (16 entries with 4K granule). This means we ended up issuing 32 TLBIs per (pmd|pte) table during the population phase. Let's fix that, and fixmap/fixunmap each page once per population, for a saving of 31 TLBIs per (pmd|pte) table. This gives a significant boot speedup. 
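To make the arithmetic above concrete (assuming a 4K granule, as in the example above): a pte table holds 512 entries, i.e. 512 / 16 = 32 contpte blocks, and each block's fixunmap previously issued one TLB invalidation, which gives the 32 TLBIs per table quoted above; fixmapping and fixunmapping the table once per population reduces that to a single TLBI, hence the saving of 31.
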
Execution time of map_mem(), which creates the kernel linear map page tables, was measured on different machines with different RAM configs: | Apple M2 VM | Ampere Altra| Ampere Altra| Ampere Altra | VM, 16G | VM, 64G | VM, 256G | Metal, 512G ---------------|-------------|-------------|-------------|------------- | ms (%) | ms (%) | ms (%) | ms (%) ---------------|-------------|-------------|-------------|------------- before | 168 (0%) | 2198 (0%) | 8644 (0%) | 17447 (0%) after | 78 (-53%) | 435 (-80%) | 1723 (-80%) | 3779 (-78%) Signed-off-by: Ryan Roberts Tested-by: Itaru Kitayama Tested-by: Eric Chanudet Reviewed-by: Mark Rutland Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240412131908.433043-2-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/mm/mmu.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 495b732d5af3..9f1d69b7b494 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -172,12 +172,9 @@ bool pgattr_change_is_safe(u64 old, u64 new) return ((old ^ new) & ~mask) == 0; } -static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, +static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot) { - pte_t *ptep; - - ptep = pte_set_fixmap_offset(pmdp, addr); do { pte_t old_pte = __ptep_get(ptep); @@ -192,8 +189,6 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, phys += PAGE_SIZE; } while (ptep++, addr += PAGE_SIZE, addr != end); - - pte_clear_fixmap(); } static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, @@ -204,6 +199,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, { unsigned long next; pmd_t pmd = READ_ONCE(*pmdp); + pte_t *ptep; BUG_ON(pmd_sect(pmd)); if (pmd_none(pmd)) { @@ -219,6 +215,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, } BUG_ON(pmd_bad(pmd)); + ptep = pte_set_fixmap_offset(pmdp, addr); do { pgprot_t __prot = prot; @@ -229,20 +226,21 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, (flags & NO_CONT_MAPPINGS) == 0) __prot = __pgprot(pgprot_val(prot) | PTE_CONT); - init_pte(pmdp, addr, next, phys, __prot); + init_pte(ptep, addr, next, phys, __prot); + ptep += pte_index(next) - pte_index(addr); phys += next - addr; } while (addr = next, addr != end); + + pte_clear_fixmap(); } -static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end, +static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long next; - pmd_t *pmdp; - pmdp = pmd_set_fixmap_offset(pudp, addr); do { pmd_t old_pmd = READ_ONCE(*pmdp); @@ -268,8 +266,6 @@ static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end, } phys += next - addr; } while (pmdp++, addr = next, addr != end); - - pmd_clear_fixmap(); } static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, @@ -279,6 +275,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, { unsigned long next; pud_t pud = READ_ONCE(*pudp); + pmd_t *pmdp; /* * Check for initial section mappings in the pgd/pud. 
@@ -297,6 +294,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, } BUG_ON(pud_bad(pud)); + pmdp = pmd_set_fixmap_offset(pudp, addr); do { pgprot_t __prot = prot; @@ -307,10 +305,13 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, (flags & NO_CONT_MAPPINGS) == 0) __prot = __pgprot(pgprot_val(prot) | PTE_CONT); - init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags); + init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags); + pmdp += pmd_index(next) - pmd_index(addr); phys += next - addr; } while (addr = next, addr != end); + + pmd_clear_fixmap(); } static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, From 1fcb7cea8a5f7747e02230f816c2c80b060d9517 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 12 Apr 2024 14:19:07 +0100 Subject: [PATCH 21/64] arm64: mm: Batch dsb and isb when populating pgtables After removing uneccessary TLBIs, the next bottleneck when creating the page tables for the linear map is DSB and ISB, which were previously issued per-pte in __set_pte(). Since we are writing multiple ptes in a given pte table, we can elide these barriers and insert them once we have finished writing to the table. Execution time of map_mem(), which creates the kernel linear map page tables, was measured on different machines with different RAM configs: | Apple M2 VM | Ampere Altra| Ampere Altra| Ampere Altra | VM, 16G | VM, 64G | VM, 256G | Metal, 512G ---------------|-------------|-------------|-------------|------------- | ms (%) | ms (%) | ms (%) | ms (%) ---------------|-------------|-------------|-------------|------------- before | 78 (0%) | 435 (0%) | 1723 (0%) | 3779 (0%) after | 11 (-86%) | 161 (-63%) | 656 (-62%) | 1654 (-56%) Signed-off-by: Ryan Roberts Tested-by: Itaru Kitayama Tested-by: Eric Chanudet Reviewed-by: Mark Rutland Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240412131908.433043-3-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable.h | 7 ++++++- arch/arm64/mm/mmu.c | 11 ++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index afdd56d26ad7..105a95a8845c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -271,9 +271,14 @@ static inline pte_t pte_mkdevmap(pte_t pte) return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL)); } -static inline void __set_pte(pte_t *ptep, pte_t pte) +static inline void __set_pte_nosync(pte_t *ptep, pte_t pte) { WRITE_ONCE(*ptep, pte); +} + +static inline void __set_pte(pte_t *ptep, pte_t pte) +{ + __set_pte_nosync(ptep, pte); /* * Only if the new pte is valid and kernel, otherwise TLB maintenance diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9f1d69b7b494..ac88b89770a6 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -178,7 +178,11 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end, do { pte_t old_pte = __ptep_get(ptep); - __set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); + /* + * Required barriers to make this visible to the table walker + * are deferred to the end of alloc_init_cont_pte(). 
+ */ + __set_pte_nosync(ptep, pfn_pte(__phys_to_pfn(phys), prot)); /* * After the PTE entry has been populated once, we @@ -232,6 +236,11 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, phys += next - addr; } while (addr = next, addr != end); + /* + * Note: barriers and maintenance necessary to clear the fixmap slot + * ensure that all previous pgtable writes are visible to the table + * walker. + */ pte_clear_fixmap(); } From 0e9df1c905d8293d333ace86c13d147382f5caf9 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 12 Apr 2024 14:19:08 +0100 Subject: [PATCH 22/64] arm64: mm: Don't remap pgtables for allocate vs populate During linear map pgtable creation, each pgtable is fixmapped / fixunmapped twice; once during allocation to zero the memory, and a again during population to write the entries. This means each table has 2 TLB invalidations issued against it. Let's fix this so that each table is only fixmapped/fixunmapped once, halving the number of TLBIs, and improving performance. Achieve this by separating allocation and initialization (zeroing) of the page. The allocated page is now fixmapped directly by the walker and initialized, before being populated and finally fixunmapped. This approach keeps the change small, but has the side effect that late allocations (using __get_free_page()) must also go through the generic memory clearing routine. So let's tell __get_free_page() not to zero the memory to avoid duplication. Additionally this approach means that fixmap/fixunmap is still used for late pgtable modifications. That's not technically needed since the memory is all mapped in the linear map by that point. That's left as a possible future optimization if found to be needed. Execution time of map_mem(), which creates the kernel linear map page tables, was measured on different machines with different RAM configs: | Apple M2 VM | Ampere Altra| Ampere Altra| Ampere Altra | VM, 16G | VM, 64G | VM, 256G | Metal, 512G ---------------|-------------|-------------|-------------|------------- | ms (%) | ms (%) | ms (%) | ms (%) ---------------|-------------|-------------|-------------|------------- before | 11 (0%) | 161 (0%) | 656 (0%) | 1654 (0%) after | 10 (-11%) | 104 (-35%) | 438 (-33%) | 1223 (-26%) Signed-off-by: Ryan Roberts Suggested-by: Mark Rutland Tested-by: Itaru Kitayama Tested-by: Eric Chanudet Reviewed-by: Mark Rutland Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20240412131908.433043-4-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable.h | 2 + arch/arm64/mm/mmu.c | 67 +++++++++++++++++--------------- 2 files changed, 37 insertions(+), 32 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 105a95a8845c..92c9aed5e7af 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1010,6 +1010,8 @@ static inline p4d_t *p4d_offset_kimg(pgd_t *pgdp, u64 addr) static inline bool pgtable_l5_enabled(void) { return false; } +#define p4d_index(addr) (((addr) >> P4D_SHIFT) & (PTRS_PER_P4D - 1)) + /* Match p4d_offset folding in */ #define p4d_set_fixmap(addr) NULL #define p4d_set_fixmap_offset(p4dp, addr) ((p4d_t *)p4dp) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ac88b89770a6..c927e9312f10 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -109,28 +109,12 @@ EXPORT_SYMBOL(phys_mem_access_prot); static phys_addr_t __init early_pgtable_alloc(int shift) { phys_addr_t phys; - void *ptr; phys = 
memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0, MEMBLOCK_ALLOC_NOLEAKTRACE); if (!phys) panic("Failed to allocate page table page\n"); - /* - * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE - * slot will be free, so we can (ab)use the FIX_PTE slot to initialise - * any level of table. - */ - ptr = pte_set_fixmap(phys); - - memset(ptr, 0, PAGE_SIZE); - - /* - * Implicit barriers also ensure the zeroed page is visible to the page - * table walker - */ - pte_clear_fixmap(); - return phys; } @@ -172,6 +156,14 @@ bool pgattr_change_is_safe(u64 old, u64 new) return ((old ^ new) & ~mask) == 0; } +static void init_clear_pgtable(void *table) +{ + clear_page(table); + + /* Ensure the zeroing is observed by page table walks. */ + dsb(ishst); +} + static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot) { @@ -214,12 +206,15 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, pmdval |= PMD_TABLE_PXN; BUG_ON(!pgtable_alloc); pte_phys = pgtable_alloc(PAGE_SHIFT); + ptep = pte_set_fixmap(pte_phys); + init_clear_pgtable(ptep); + ptep += pte_index(addr); __pmd_populate(pmdp, pte_phys, pmdval); - pmd = READ_ONCE(*pmdp); + } else { + BUG_ON(pmd_bad(pmd)); + ptep = pte_set_fixmap_offset(pmdp, addr); } - BUG_ON(pmd_bad(pmd)); - ptep = pte_set_fixmap_offset(pmdp, addr); do { pgprot_t __prot = prot; @@ -298,12 +293,15 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, pudval |= PUD_TABLE_PXN; BUG_ON(!pgtable_alloc); pmd_phys = pgtable_alloc(PMD_SHIFT); + pmdp = pmd_set_fixmap(pmd_phys); + init_clear_pgtable(pmdp); + pmdp += pmd_index(addr); __pud_populate(pudp, pmd_phys, pudval); - pud = READ_ONCE(*pudp); + } else { + BUG_ON(pud_bad(pud)); + pmdp = pmd_set_fixmap_offset(pudp, addr); } - BUG_ON(pud_bad(pud)); - pmdp = pmd_set_fixmap_offset(pudp, addr); do { pgprot_t __prot = prot; @@ -340,12 +338,15 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, p4dval |= P4D_TABLE_PXN; BUG_ON(!pgtable_alloc); pud_phys = pgtable_alloc(PUD_SHIFT); + pudp = pud_set_fixmap(pud_phys); + init_clear_pgtable(pudp); + pudp += pud_index(addr); __p4d_populate(p4dp, pud_phys, p4dval); - p4d = READ_ONCE(*p4dp); + } else { + BUG_ON(p4d_bad(p4d)); + pudp = pud_set_fixmap_offset(p4dp, addr); } - BUG_ON(p4d_bad(p4d)); - pudp = pud_set_fixmap_offset(p4dp, addr); do { pud_t old_pud = READ_ONCE(*pudp); @@ -395,12 +396,15 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, pgdval |= PGD_TABLE_PXN; BUG_ON(!pgtable_alloc); p4d_phys = pgtable_alloc(P4D_SHIFT); + p4dp = p4d_set_fixmap(p4d_phys); + init_clear_pgtable(p4dp); + p4dp += p4d_index(addr); __pgd_populate(pgdp, p4d_phys, pgdval); - pgd = READ_ONCE(*pgdp); + } else { + BUG_ON(pgd_bad(pgd)); + p4dp = p4d_set_fixmap_offset(pgdp, addr); } - BUG_ON(pgd_bad(pgd)); - p4dp = p4d_set_fixmap_offset(pgdp, addr); do { p4d_t old_p4d = READ_ONCE(*p4dp); @@ -467,11 +471,10 @@ void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, static phys_addr_t __pgd_pgtable_alloc(int shift) { - void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL); - BUG_ON(!ptr); + /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. 
*/ + void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO); - /* Ensure the zeroed page is visible to the page table walker */ - dsb(ishst); + BUG_ON(!ptr); return __pa(ptr); } From bc5b492ac305e0d1a5b05cd55db2274987449d02 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 11 Mar 2024 12:19:14 +0000 Subject: [PATCH 23/64] ACPICA: Detect FACS even for hardware reduced platforms ACPICA commit 44fc328a1a14b097d92b8be83989e4bf69b6e6cb The FACS is optional even on hardware reduced platforms, and may exist for the purpose of communicating the hardware_signature field to provoke a clean reboot instead of a resume from hibernation. Signed-off-by: David Woodhouse Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20240412073530.2222496-2-dwmw2@infradead.org Signed-off-by: Will Deacon --- drivers/acpi/acpica/tbfadt.c | 30 +++++++++++++----------------- drivers/acpi/acpica/tbutils.c | 7 +------ 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/drivers/acpi/acpica/tbfadt.c b/drivers/acpi/acpica/tbfadt.c index 44267a92bce5..3c126c6d306b 100644 --- a/drivers/acpi/acpica/tbfadt.c +++ b/drivers/acpi/acpica/tbfadt.c @@ -315,23 +315,19 @@ void acpi_tb_parse_fadt(void) ACPI_TABLE_ORIGIN_INTERNAL_PHYSICAL, NULL, FALSE, TRUE, &acpi_gbl_dsdt_index); - /* If Hardware Reduced flag is set, there is no FACS */ - - if (!acpi_gbl_reduced_hardware) { - if (acpi_gbl_FADT.facs) { - acpi_tb_install_standard_table((acpi_physical_address) - acpi_gbl_FADT.facs, - ACPI_TABLE_ORIGIN_INTERNAL_PHYSICAL, - NULL, FALSE, TRUE, - &acpi_gbl_facs_index); - } - if (acpi_gbl_FADT.Xfacs) { - acpi_tb_install_standard_table((acpi_physical_address) - acpi_gbl_FADT.Xfacs, - ACPI_TABLE_ORIGIN_INTERNAL_PHYSICAL, - NULL, FALSE, TRUE, - &acpi_gbl_xfacs_index); - } + if (acpi_gbl_FADT.facs) { + acpi_tb_install_standard_table((acpi_physical_address) + acpi_gbl_FADT.facs, + ACPI_TABLE_ORIGIN_INTERNAL_PHYSICAL, + NULL, FALSE, TRUE, + &acpi_gbl_facs_index); + } + if (acpi_gbl_FADT.Xfacs) { + acpi_tb_install_standard_table((acpi_physical_address) + acpi_gbl_FADT.Xfacs, + ACPI_TABLE_ORIGIN_INTERNAL_PHYSICAL, + NULL, FALSE, TRUE, + &acpi_gbl_xfacs_index); } } diff --git a/drivers/acpi/acpica/tbutils.c b/drivers/acpi/acpica/tbutils.c index bb4a56e5673a..15fa68a5ea6e 100644 --- a/drivers/acpi/acpica/tbutils.c +++ b/drivers/acpi/acpica/tbutils.c @@ -36,12 +36,7 @@ acpi_status acpi_tb_initialize_facs(void) { struct acpi_table_facs *facs; - /* If Hardware Reduced flag is set, there is no FACS */ - - if (acpi_gbl_reduced_hardware) { - acpi_gbl_FACS = NULL; - return (AE_OK); - } else if (acpi_gbl_FADT.Xfacs && + if (acpi_gbl_FADT.Xfacs && (!acpi_gbl_FADT.facs || !acpi_gbl_use32_bit_facs_addresses)) { (void)acpi_get_table_by_index(acpi_gbl_xfacs_index, From fbaad243b5368f19c1e96dc9d914eefaebcb6ecc Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 11 Mar 2024 13:04:07 +0000 Subject: [PATCH 24/64] arm64: acpi: Honour firmware_signature field of FACS, if it exists If the firmware_signature changes then OSPM should not attempt to resume from hibernate, but should instead perform a clean reboot. Set the global swsusp_hardware_signature to allow the generic code to include the value in the swsusp header on disk, and perform the appropriate check on resume. Signed-off-by: David Woodhouse Acked-by: Sudeep Holla Acked-by: Rafael J. 
Wysocki Link: https://lore.kernel.org/r/20240412073530.2222496-3-dwmw2@infradead.org Signed-off-by: Will Deacon --- arch/arm64/kernel/acpi.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index dba8fcec7f33..e0e7b93c16cc 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -227,6 +228,15 @@ void __init acpi_boot_table_init(void) if (earlycon_acpi_spcr_enable) early_init_dt_scan_chosen_stdout(); } else { +#ifdef CONFIG_HIBERNATION + struct acpi_table_header *facs = NULL; + acpi_get_table(ACPI_SIG_FACS, 1, &facs); + if (facs) { + swsusp_hardware_signature = + ((struct acpi_table_facs *)facs)->hardware_signature; + acpi_put_table(facs); + } +#endif acpi_parse_spcr(earlycon_acpi_spcr_enable, true); if (IS_ENABLED(CONFIG_ACPI_BGRT)) acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); From f4d9d9dcc70b96b5e5d7801bd5fbf8491b07b13d Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Tue, 9 Jan 2024 13:23:08 -0600 Subject: [PATCH 25/64] arm64: Add Neoverse-V2 part Add the part number and MIDR for Neoverse-V2 Signed-off-by: Besar Wicaksono Reviewed-by: James Clark Link: https://lore.kernel.org/r/20240109192310.16234-2-bwicaksono@nvidia.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/cputype.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 52f076afeb96..936389e9aecb 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -86,6 +86,7 @@ #define ARM_CPU_PART_CORTEX_X2 0xD48 #define ARM_CPU_PART_NEOVERSE_N2 0xD49 #define ARM_CPU_PART_CORTEX_A78C 0xD4B +#define ARM_CPU_PART_NEOVERSE_V2 0xD4F #define APM_CPU_PART_XGENE 0x000 #define APM_CPU_VAR_POTENZA 0x00 @@ -159,6 +160,7 @@ #define MIDR_CORTEX_X2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X2) #define MIDR_NEOVERSE_N2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N2) #define MIDR_CORTEX_A78C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78C) +#define MIDR_NEOVERSE_V2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_V2) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) From 1fb8950417a4c73c33dd827b816ee41c8cf0c26a Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:28 +0100 Subject: [PATCH 26/64] perf/hisi-pcie: Assign parent for event_source device Currently the PMU device appears directly under /sys/devices/ Only root busses should appear there, so instead assign the pmu->dev parent to be the PCI device. 
Link: https://lore.kernel.org/linux-cxl/ZCLI9A40PJsyqAmq@kroah.com/ Reviewed-by: Yicong Yang Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-2-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_pcie_pmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c index 06b192cc31d5..b1e4739fbdb0 100644 --- a/drivers/perf/hisilicon/hisi_pcie_pmu.c +++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c @@ -806,6 +806,7 @@ static int hisi_pcie_alloc_pmu(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_ pcie_pmu->pmu = (struct pmu) { .name = name, .module = THIS_MODULE, + .parent = &pdev->dev, .event_init = hisi_pcie_pmu_event_init, .pmu_enable = hisi_pcie_pmu_enable, .pmu_disable = hisi_pcie_pmu_disable, From d0412b6ecb4e422dcb1754106c52e40946a95b61 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:29 +0100 Subject: [PATCH 27/64] Documentation: hisi-pmu: Drop reference to /sys/devices path Having assigned a parent to the device, the suggested path is no longer valid. As /sys/bus/event_sources based path is also provided, simply drop mention of alternative. Reviewed-by: Yicong Yang Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-3-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/hisi-pmu.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/admin-guide/perf/hisi-pmu.rst b/Documentation/admin-guide/perf/hisi-pmu.rst index e0174d20809a..5cc248d18c63 100644 --- a/Documentation/admin-guide/perf/hisi-pmu.rst +++ b/Documentation/admin-guide/perf/hisi-pmu.rst @@ -20,7 +20,6 @@ interrupt, and the PMU driver shall register perf PMU drivers like L3C, HHA and DDRC etc. The available events and configuration options shall be described in the sysfs, see: -/sys/devices/hisi_sccl{X}_/, or /sys/bus/event_source/devices/hisi_sccl{X}_. The "perf list" command shall list the available events from sysfs. From 16d417f6c45b1854e29fc5d6de722de0bdc6ccdf Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:30 +0100 Subject: [PATCH 28/64] perf/hisi-uncore: Assign parents for event_source devices Currently the PMU device appears directly under /sys/devices/ Only root busses should appear there, so instead assign the pmu->dev parent to be the platform device. 
Link: https://lore.kernel.org/linux-cxl/ZCLI9A40PJsyqAmq@kroah.com/ Reviewed-by: Greg Kroah-Hartman Reviewed-by: Yicong Yang Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-4-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_uncore_pmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pmu.c index ccc9191ad1b6..a60e4c966098 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c @@ -536,6 +536,7 @@ void hisi_pmu_init(struct hisi_pmu *hisi_pmu, struct module *module) struct pmu *pmu = &hisi_pmu->pmu; pmu->module = module; + pmu->parent = hisi_pmu->dev; pmu->task_ctx_nr = perf_invalid_context; pmu->event_init = hisi_uncore_pmu_event_init; pmu->pmu_enable = hisi_uncore_pmu_enable; From eff6af531335341d7a988ea4cacd4e5938a2321a Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:31 +0100 Subject: [PATCH 29/64] Documentation: hns-pmu: Use /sys/bus/event_source/devices paths To allow setting an appropriate parent for the struct pmu device remove existing references to /sys/devices/ path. Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-5-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/hns3-pmu.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/perf/hns3-pmu.rst b/Documentation/admin-guide/perf/hns3-pmu.rst index 75a40846d47f..1195e570f2d6 100644 --- a/Documentation/admin-guide/perf/hns3-pmu.rst +++ b/Documentation/admin-guide/perf/hns3-pmu.rst @@ -16,7 +16,7 @@ HNS3 PMU driver The HNS3 PMU driver registers a perf PMU with the name of its sicl id.:: - /sys/devices/hns3_pmu_sicl_ + /sys/bus/event_source/devices/hns3_pmu_sicl_ PMU driver provides description of available events, filter modes, format, identifier and cpumask in sysfs. @@ -40,9 +40,9 @@ device. Example usage of checking event code and subevent code:: - $# cat /sys/devices/hns3_pmu_sicl_0/events/dly_tx_normal_to_mac_time + $# cat /sys/bus/event_source/devices/hns3_pmu_sicl_0/events/dly_tx_normal_to_mac_time config=0x00204 - $# cat /sys/devices/hns3_pmu_sicl_0/events/dly_tx_normal_to_mac_packet_num + $# cat /sys/bus/event_source/devices/hns3_pmu_sicl_0/events/dly_tx_normal_to_mac_packet_num config=0x10204 Each performance statistic has a pair of events to get two values to @@ -60,7 +60,7 @@ computation to calculate real performance data is::: Example usage of checking supported filter mode:: - $# cat /sys/devices/hns3_pmu_sicl_0/filtermode/bw_ssu_rpu_byte_num + $# cat /sys/bus/event_source/devices/hns3_pmu_sicl_0/filtermode/bw_ssu_rpu_byte_num filter mode supported: global/port/port-tc/func/func-queue/ Example usage of perf:: From 3d957de12c65a8757a8007735ed2a303e0b365a9 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:32 +0100 Subject: [PATCH 30/64] perf/hisi-hns3: Assign parents for event_source device Currently the PMU device appears directly under /sys/devices/ Only root busses should appear there, so instead assign the pmu->dev parent to be the PCI device. 
Link: https://lore.kernel.org/linux-cxl/ZCLI9A40PJsyqAmq@kroah.com/ Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-6-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hns3_pmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/hisilicon/hns3_pmu.c b/drivers/perf/hisilicon/hns3_pmu.c index 16869bf5bf4c..5236acdcc2e1 100644 --- a/drivers/perf/hisilicon/hns3_pmu.c +++ b/drivers/perf/hisilicon/hns3_pmu.c @@ -1419,6 +1419,7 @@ static int hns3_pmu_alloc_pmu(struct pci_dev *pdev, struct hns3_pmu *hns3_pmu) hns3_pmu->pmu = (struct pmu) { .name = name, .module = THIS_MODULE, + .parent = &pdev->dev, .event_init = hns3_pmu_event_init, .pmu_enable = hns3_pmu_enable, .pmu_disable = hns3_pmu_disable, From 1b7718fcc3f20f89931a4cf6a4fde2546cf143bd Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:33 +0100 Subject: [PATCH 31/64] perf/amlogic: Assign parents for event_source devices Currently all these devices appear directly under /sys/devices/ Only root busses should appear there, so instead assign the pmu->dev parents to be the platform device. Link: https://lore.kernel.org/linux-cxl/ZCLI9A40PJsyqAmq@kroah.com/ Reviewed-by: Jiucheng Xu Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-7-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- drivers/perf/amlogic/meson_ddr_pmu_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/amlogic/meson_ddr_pmu_core.c b/drivers/perf/amlogic/meson_ddr_pmu_core.c index bbc7285fd934..07446d784a1a 100644 --- a/drivers/perf/amlogic/meson_ddr_pmu_core.c +++ b/drivers/perf/amlogic/meson_ddr_pmu_core.c @@ -492,6 +492,7 @@ int meson_ddr_pmu_create(struct platform_device *pdev) *pmu = (struct ddr_pmu) { .pmu = { .module = THIS_MODULE, + .parent = &pdev->dev, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, .task_ctx_nr = perf_invalid_context, .attr_groups = attr_groups, From 3a1bb75ebc1b8965c7f7dbb86584143feda8e83b Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:34 +0100 Subject: [PATCH 32/64] perf/arm_cspmu: Assign parents for event_source devices Currently all these devices appear directly under /sys/devices/ Only root busses should appear there, so instead assign the pmu->dev parents to be the platform device. Link: https://lore.kernel.org/linux-cxl/ZCLI9A40PJsyqAmq@kroah.com/ Reviewed-by: Suzuki K Poulose Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-8-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- drivers/perf/arm_cspmu/arm_cspmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index fd1004251665..ba0cf2f466ef 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -1206,6 +1206,7 @@ static int arm_cspmu_register_pmu(struct arm_cspmu *cspmu) cspmu->pmu = (struct pmu){ .task_ctx_nr = perf_invalid_context, .module = cspmu->impl.module, + .parent = cspmu->dev, .pmu_enable = arm_cspmu_enable, .pmu_disable = arm_cspmu_disable, .event_init = arm_cspmu_event_init, From 867ba6d204f1c35c5d615c9b9c62f51a699220fa Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:35 +0100 Subject: [PATCH 33/64] Documentation: xgene-pmu: Use /sys/bus/event_source/devices paths To allow setting an appropriate parent for the struct pmu device remove existing references to /sys/devices/ path. 
Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-9-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/xgene-pmu.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/perf/xgene-pmu.rst b/Documentation/admin-guide/perf/xgene-pmu.rst index 644f8ed89152..98ccb8e777c4 100644 --- a/Documentation/admin-guide/perf/xgene-pmu.rst +++ b/Documentation/admin-guide/perf/xgene-pmu.rst @@ -13,7 +13,7 @@ PMU (perf) driver The xgene-pmu driver registers several perf PMU drivers. Each of the perf driver provides description of its available events and configuration options -in sysfs, see /sys/devices//. +in sysfs, see /sys/bus/event_source/devices//. The "format" directory describes format of the config (event ID), config1 (agent ID) fields of the perf_event_attr structure. The "events" From 89e34f8bee6cdd4137966321a5b5573412079116 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:36 +0100 Subject: [PATCH 34/64] perf/xgene: Assign parents for event_source devices Currently all these devices appear directly under /sys/devices/ Only root busses should appear there, so instead assign the pmu->dev parents to be the hardware related struct device. Link: https://lore.kernel.org/linux-cxl/ZCLI9A40PJsyqAmq@kroah.com/ Cc: Khuong Dinh Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-10-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- drivers/perf/xgene_pmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/xgene_pmu.c b/drivers/perf/xgene_pmu.c index 0d49343d704b..8823b4c6b556 100644 --- a/drivers/perf/xgene_pmu.c +++ b/drivers/perf/xgene_pmu.c @@ -1102,6 +1102,7 @@ static int xgene_init_perf(struct xgene_pmu_dev *pmu_dev, char *name) /* Perf driver registration */ pmu_dev->pmu = (struct pmu) { + .parent = pmu_dev->parent->dev, .attr_groups = pmu_dev->attr_groups, .task_ctx_nr = perf_invalid_context, .pmu_enable = xgene_perf_pmu_enable, From 90b4a1a927ee03972b70c69efbf7642654c79902 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:37 +0100 Subject: [PATCH 35/64] Documentation: thunderx2-pmu: Use /sys/bus/event_source/devices paths To allow setting an appropriate parent for the struct pmu device remove existing references to /sys/devices/ path. Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-11-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/thunderx2-pmu.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/perf/thunderx2-pmu.rst b/Documentation/admin-guide/perf/thunderx2-pmu.rst index 01f158238ae1..9255f7bf9452 100644 --- a/Documentation/admin-guide/perf/thunderx2-pmu.rst +++ b/Documentation/admin-guide/perf/thunderx2-pmu.rst @@ -22,7 +22,7 @@ The thunderx2_pmu driver registers per-socket perf PMUs for the DMC and L3C devices. Each PMU can be used to count up to 4 (DMC/L3C) or up to 8 (CCPI2) events simultaneously. The PMUs provide a description of their available events and configuration options under sysfs, see -/sys/devices/uncore_; S is the socket id. +/sys/bus/event_source/devices/uncore_; S is the socket id. The driver does not support sampling, therefore "perf record" will not work. Per-task perf sessions are also not supported. 
From ecb79c21c18943487b4e16ae2e5fe60e8f59e08a Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:38 +0100 Subject: [PATCH 36/64] perf/thunderx2: Assign parents for event_source devices Currently all these devices appear directly under /sys/devices/ Only root busses should appear there, so instead assign the pmu->dev parents to be the platform device. Link: https://lore.kernel.org/linux-cxl/ZCLI9A40PJsyqAmq@kroah.com/ Cc: Robert Richter Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-12-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- drivers/perf/thunderx2_pmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/thunderx2_pmu.c b/drivers/perf/thunderx2_pmu.c index 33e8ff3e5265..faf763d2c95c 100644 --- a/drivers/perf/thunderx2_pmu.c +++ b/drivers/perf/thunderx2_pmu.c @@ -724,6 +724,7 @@ static int tx2_uncore_pmu_register( /* Perf event registration */ tx2_pmu->pmu = (struct pmu) { .module = THIS_MODULE, + .parent = tx2_pmu->dev, .attr_groups = tx2_pmu->attr_groups, .task_ctx_nr = perf_invalid_context, .event_init = tx2_uncore_event_init, From 50650e5f3186dcd3cc291b515022eab281256688 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:39 +0100 Subject: [PATCH 37/64] perf/riscv: Assign parents for event_source devices Currently all these devices appear directly under /sys/devices/ Only root busses should appear there, so instead assign the pmu->dev parents to be the appropriate platform devices. Link: https://lore.kernel.org/linux-cxl/ZCLI9A40PJsyqAmq@kroah.com/ Cc: Atish Patra CC: Anup Patel Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-13-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- drivers/perf/riscv_pmu_legacy.c | 1 + drivers/perf/riscv_pmu_sbi.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/perf/riscv_pmu_legacy.c b/drivers/perf/riscv_pmu_legacy.c index fa0bccf4edf2..04487ad7fba0 100644 --- a/drivers/perf/riscv_pmu_legacy.c +++ b/drivers/perf/riscv_pmu_legacy.c @@ -136,6 +136,7 @@ static int pmu_legacy_device_probe(struct platform_device *pdev) pmu = riscv_pmu_alloc(); if (!pmu) return -ENOMEM; + pmu->pmu.parent = &pdev->dev; pmu_legacy_init(pmu); return 0; diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index 5aef5a8737b2..82636273d726 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -1080,6 +1080,7 @@ static int pmu_sbi_device_probe(struct platform_device *pdev) } pmu->pmu.attr_groups = riscv_pmu_attr_groups; + pmu->pmu.parent = &pdev->dev; pmu->cmask = cmask; pmu->ctr_start = pmu_sbi_ctr_start; pmu->ctr_stop = pmu_sbi_ctr_stop; From 556da13434521abd912bcb810ef871cf0f3555e8 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:40 +0100 Subject: [PATCH 38/64] Documentation: qcom-pmu: Use /sys/bus/event_source/devices paths To allow setting an appropriate parent for the struct pmu device remove existing references to /sys/devices/ path. 
Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-14-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/qcom_l2_pmu.rst | 2 +- Documentation/admin-guide/perf/qcom_l3_pmu.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/perf/qcom_l2_pmu.rst b/Documentation/admin-guide/perf/qcom_l2_pmu.rst index c130178a4a55..c37c6be9b8d8 100644 --- a/Documentation/admin-guide/perf/qcom_l2_pmu.rst +++ b/Documentation/admin-guide/perf/qcom_l2_pmu.rst @@ -10,7 +10,7 @@ There is one logical L2 PMU exposed, which aggregates the results from the physical PMUs. The driver provides a description of its available events and configuration -options in sysfs, see /sys/devices/l2cache_0. +options in sysfs, see /sys/bus/event_source/devices/l2cache_0. The "format" directory describes the format of the events. diff --git a/Documentation/admin-guide/perf/qcom_l3_pmu.rst b/Documentation/admin-guide/perf/qcom_l3_pmu.rst index a3d014a46bfd..a66556b7e985 100644 --- a/Documentation/admin-guide/perf/qcom_l3_pmu.rst +++ b/Documentation/admin-guide/perf/qcom_l3_pmu.rst @@ -9,7 +9,7 @@ PMU with device name l3cache__. User space is responsible for aggregating across slices. The driver provides a description of its available events and configuration -options in sysfs, see /sys/devices/l3cache*. Given that these are uncore PMUs +options in sysfs, see /sys/bus/event_source/devices/l3cache*. Given that these are uncore PMUs the driver also exposes a "cpumask" sysfs attribute which contains a mask consisting of one CPU per socket which will be used to handle all the PMU events on that socket. From 6148865dd57cce4acae81909892d5a9fe16ecce7 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:41 +0100 Subject: [PATCH 39/64] perf/qcom: Assign parents for event_source devices Currently all these devices appear directly under /sys/devices/ Only root busses should appear there, so instead assign the pmu->dev parents to be the platform devices. 
Link: https://lore.kernel.org/linux-cxl/ZCLI9A40PJsyqAmq@kroah.com/ Cc: Andy Gross Cc: Bjorn Andersson Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-15-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- drivers/perf/qcom_l2_pmu.c | 1 + drivers/perf/qcom_l3_pmu.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/perf/qcom_l2_pmu.c b/drivers/perf/qcom_l2_pmu.c index b5a44dc1dc3a..980e3051edd7 100644 --- a/drivers/perf/qcom_l2_pmu.c +++ b/drivers/perf/qcom_l2_pmu.c @@ -902,6 +902,7 @@ static int l2_cache_pmu_probe(struct platform_device *pdev) l2cache_pmu->pmu = (struct pmu) { /* suffix is instance id for future use with multiple sockets */ .name = "l2cache_0", + .parent = &pdev->dev, .task_ctx_nr = perf_invalid_context, .pmu_enable = l2_cache_pmu_enable, .pmu_disable = l2_cache_pmu_disable, diff --git a/drivers/perf/qcom_l3_pmu.c b/drivers/perf/qcom_l3_pmu.c index f16783d03db7..37786e88514e 100644 --- a/drivers/perf/qcom_l3_pmu.c +++ b/drivers/perf/qcom_l3_pmu.c @@ -748,6 +748,7 @@ static int qcom_l3_cache_pmu_probe(struct platform_device *pdev) return -ENOMEM; l3pmu->pmu = (struct pmu) { + .parent = &pdev->dev, .task_ctx_nr = perf_invalid_context, .pmu_enable = qcom_l3_cache__pmu_enable, From 1d194ab8571beb7363519d5c5b050a0cbd59b664 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:42 +0100 Subject: [PATCH 40/64] perf/imx_ddr: Assign parents for event_source devices Currently all this device appear directly under /sys/devices/ Only root busses should appear there, so instead assign the pmu->dev parent to be the platform device. Link: https://lore.kernel.org/linux-cxl/ZCLI9A40PJsyqAmq@kroah.com/ Cc: Frank Li Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-16-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- drivers/perf/fsl_imx8_ddr_perf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c index 4e8fa5a48fcf..1bbdb29743c4 100644 --- a/drivers/perf/fsl_imx8_ddr_perf.c +++ b/drivers/perf/fsl_imx8_ddr_perf.c @@ -651,6 +651,7 @@ static int ddr_perf_init(struct ddr_pmu *pmu, void __iomem *base, *pmu = (struct ddr_pmu) { .pmu = (struct pmu) { .module = THIS_MODULE, + .parent = dev, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, .task_ctx_nr = perf_invalid_context, .attr_groups = attr_groups, From 7bf75431a9ba1b3c2ededbcf08dca023786337fc Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:43 +0100 Subject: [PATCH 41/64] perf/arm_pmu: Assign parents for event_source devices Currently the PMU device appears directly under /sys/devices/ Only root busses should appear there, so instead assign the pmu->dev parent to be the platform device. 
Link: https://lore.kernel.org/linux-cxl/ZCLI9A40PJsyqAmq@kroah.com/ Signed-off-by: Jonathan Cameron Reviewed-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20240412161057.14099-17-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- drivers/perf/arm_pmu_platform.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/arm_pmu_platform.c b/drivers/perf/arm_pmu_platform.c index 3596db36cbff..4b1a9a92ea11 100644 --- a/drivers/perf/arm_pmu_platform.c +++ b/drivers/perf/arm_pmu_platform.c @@ -196,6 +196,7 @@ int arm_pmu_device_probe(struct platform_device *pdev, if (!pmu) return -ENOMEM; + pmu->pmu.parent = &pdev->dev; pmu->plat_device = pdev; ret = pmu_parse_irqs(pmu); From 1919bd8e0be0bdd0382ee672a40e75bf19c0068c Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:44 +0100 Subject: [PATCH 42/64] perf/alibaba_uncore: Assign parents for event_source device Currently the PMU device appears directly under /sys/devices/ Only root busses should appear there, so instead assign the pmu->dev parent to be the platform device. Link: https://lore.kernel.org/linux-cxl/ZCLI9A40PJsyqAmq@kroah.com/ Reviewed-by: Shuai Xue Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-18-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- drivers/perf/alibaba_uncore_drw_pmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/alibaba_uncore_drw_pmu.c b/drivers/perf/alibaba_uncore_drw_pmu.c index d4d14b65c4a5..89dd38343f93 100644 --- a/drivers/perf/alibaba_uncore_drw_pmu.c +++ b/drivers/perf/alibaba_uncore_drw_pmu.c @@ -709,6 +709,7 @@ static int ali_drw_pmu_probe(struct platform_device *pdev) drw_pmu->pmu = (struct pmu) { .module = THIS_MODULE, + .parent = &pdev->dev, .task_ctx_nr = perf_invalid_context, .event_init = ali_drw_pmu_event_init, .add = ali_drw_pmu_add, From e7ec4791f903d65548519a9ceeaec4f44a591655 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:45 +0100 Subject: [PATCH 43/64] perf/arm-cci: Assign parents for event_source device Currently the PMU device appears directly under /sys/devices/ Only root busses should appear there, so instead assign the pmu->dev parent to be the platform device. Link: https://lore.kernel.org/linux-cxl/ZCLI9A40PJsyqAmq@kroah.com/ Reviewed-by: Suzuki K Poulose Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-19-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- drivers/perf/arm-cci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c index 6be03f81ae5d..a7fd80677919 100644 --- a/drivers/perf/arm-cci.c +++ b/drivers/perf/arm-cci.c @@ -1409,6 +1409,7 @@ static int cci_pmu_init(struct cci_pmu *cci_pmu, struct platform_device *pdev) cci_pmu->pmu = (struct pmu) { .module = THIS_MODULE, + .parent = &pdev->dev, .name = cci_pmu->model->name, .task_ctx_nr = perf_invalid_context, .pmu_enable = cci_pmu_enable, From f4144be05a606371d7258b618e383f1276e3c207 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:46 +0100 Subject: [PATCH 44/64] perf/arm-ccn: Assign parents for event_source device Currently the PMU device appears directly under /sys/devices/ Only root busses should appear there, so instead assign the pmu->dev parent to be the platform device. 
Link: https://lore.kernel.org/linux-cxl/ZCLI9A40PJsyqAmq@kroah.com/ Acked-by: Suzuki K Poulose Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-20-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- drivers/perf/arm-ccn.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c index 641471bd5eff..f4495ff6525f 100644 --- a/drivers/perf/arm-ccn.c +++ b/drivers/perf/arm-ccn.c @@ -1265,6 +1265,7 @@ static int arm_ccn_pmu_init(struct arm_ccn *ccn) /* Perf driver registration */ ccn->dt.pmu = (struct pmu) { .module = THIS_MODULE, + .parent = ccn->dev, .attr_groups = arm_ccn_pmu_attr_groups, .task_ctx_nr = perf_invalid_context, .event_init = arm_ccn_pmu_event_init, From 46bed4c740d5de9e2ac62b4cfc20461da463f71b Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:47 +0100 Subject: [PATCH 45/64] perf/arm-dmc620: Assign parents for event_source device Currently the PMU device appears directly under /sys/devices/ Only root busses should appear there, so instead assign the pmu->dev parent to be the platform device. Link: https://lore.kernel.org/linux-cxl/ZCLI9A40PJsyqAmq@kroah.com/ Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-21-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- drivers/perf/arm_dmc620_pmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c index 8a81be2dd5ec..2ec96e204c40 100644 --- a/drivers/perf/arm_dmc620_pmu.c +++ b/drivers/perf/arm_dmc620_pmu.c @@ -673,6 +673,7 @@ static int dmc620_pmu_device_probe(struct platform_device *pdev) dmc620_pmu->pmu = (struct pmu) { .module = THIS_MODULE, + .parent = &pdev->dev, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, .task_ctx_nr = perf_invalid_context, .event_init = dmc620_pmu_event_init, From bc81ae2efbb3f9fd12322787f475b77f51ca2a15 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:48 +0100 Subject: [PATCH 46/64] perf/arm-dsu: Assign parents for event_source device Currently the PMU device appears directly under /sys/devices/ Only root busses should appear there, so instead assign the pmu->dev parent to be the platform device. Link: https://lore.kernel.org/linux-cxl/ZCLI9A40PJsyqAmq@kroah.com/ Reviewed-by: Suzuki K Poulose Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-22-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- drivers/perf/arm_dsu_pmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c index adc0bbb5fafe..92248a24a1aa 100644 --- a/drivers/perf/arm_dsu_pmu.c +++ b/drivers/perf/arm_dsu_pmu.c @@ -742,6 +742,7 @@ static int dsu_pmu_device_probe(struct platform_device *pdev) dsu_pmu->pmu = (struct pmu) { .task_ctx_nr = perf_invalid_context, + .parent = &pdev->dev, .module = THIS_MODULE, .pmu_enable = dsu_pmu_enable, .pmu_disable = dsu_pmu_disable, From a8889fbf16bc954b8d11f16410963feee6cd9980 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:49 +0100 Subject: [PATCH 47/64] perf/arm-smmuv3: Assign parents for event_source device Currently the PMU device appears directly under /sys/devices/ Only root busses should appear there, so instead assign the pmu->dev parent to be the platform device. 
Link: https://lore.kernel.org/linux-cxl/ZCLI9A40PJsyqAmq@kroah.com/ Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-23-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- drivers/perf/arm_smmuv3_pmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c index 719aa953a1c4..d5fa92ba8373 100644 --- a/drivers/perf/arm_smmuv3_pmu.c +++ b/drivers/perf/arm_smmuv3_pmu.c @@ -860,6 +860,7 @@ static int smmu_pmu_probe(struct platform_device *pdev) smmu_pmu->pmu = (struct pmu) { .module = THIS_MODULE, + .parent = &pdev->dev, .task_ctx_nr = perf_invalid_context, .pmu_enable = smmu_pmu_enable, .pmu_disable = smmu_pmu_disable, From 4052ce07d5d7e8158dfd5c01c514b930cb689f68 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 12 Apr 2024 17:10:50 +0100 Subject: [PATCH 48/64] perf/arm-spe: Assign parents for event_source device Currently the PMU device appears directly under /sys/devices/ Only root busses should appear there, so instead assign the pmu->dev parent to be the platform device. Link: https://lore.kernel.org/linux-cxl/ZCLI9A40PJsyqAmq@kroah.com/ Acked-by: Suzuki K Poulose Signed-off-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240412161057.14099-24-Jonathan.Cameron@huawei.com Signed-off-by: Will Deacon --- drivers/perf/arm_spe_pmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 35f0de03416f..9100d82bfabc 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -932,6 +932,7 @@ static int arm_spe_pmu_perf_init(struct arm_spe_pmu *spe_pmu) spe_pmu->pmu = (struct pmu) { .module = THIS_MODULE, + .parent = &spe_pmu->pdev->dev, .capabilities = PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE, .attr_groups = arm_spe_pmu_attr_groups, /* From b7fab1b69b9c4c152c185af8fb375810e70cb606 Mon Sep 17 00:00:00 2001 From: xieming Date: Mon, 22 Apr 2024 09:57:30 +0800 Subject: [PATCH 49/64] kselftest/arm64: Remove unused parameters in abi test Remove unused parameter i in tpidr2.c main function. Signed-off-by: xieming Link: https://lore.kernel.org/r/20240422015730.89805-1-xieming@kylinos.cn Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/tpidr2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/abi/tpidr2.c b/tools/testing/selftests/arm64/abi/tpidr2.c index 02ee3a91b780..285c47dd42f6 100644 --- a/tools/testing/selftests/arm64/abi/tpidr2.c +++ b/tools/testing/selftests/arm64/abi/tpidr2.c @@ -262,7 +262,7 @@ static int write_clone_read(void) int main(int argc, char **argv) { - int ret, i; + int ret; putstr("TAP version 13\n"); putstr("1.."); From 12d712dc8e4f1a30b18f8c3789adfbc07f5eb050 Mon Sep 17 00:00:00 2001 From: Shiqi Liu Date: Sun, 21 Apr 2024 14:33:28 +0800 Subject: [PATCH 50/64] arm64/sysreg: Update PIE permission encodings Fix left shift overflow issue when the parameter idx is greater than or equal to 8 in the calculation of perm in PIRx_ELx_PERM macro. Fix this by modifying the encoding to use a long integer type. 
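For illustration only (not part of this change), the user-space sketch below shows why the plain int literals overflow for idx >= 8 while the UL() form does not; the UL() helper and the PIE_RX/PIRx_ELx_PERM() definitions here are simplified stand-ins for the kernel's versions:

  #include <stdio.h>

  /* Simplified stand-ins for the kernel's UL() and PIE/PIRx_ELx_PERM() definitions. */
  #define UL(x)				(x##UL)
  #define PIE_RX				UL(0xa)
  #define PIRx_ELx_PERM(idx, perm)	((perm) << ((idx) * 4))

  int main(void)
  {
  	/*
  	 * With a plain int literal (0xa), an index of 8 or more shifts a
  	 * 32-bit value by 32+ bits, which is undefined and loses the
  	 * permission bits. With the UL() form the shift is done on a
  	 * 64-bit unsigned long, so indices 8..15 encode correctly.
  	 */
  	printf("PIR entry for idx 8: %#lx\n", PIRx_ELx_PERM(8, PIE_RX));
  	return 0;
  }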
Signed-off-by: Shiqi Liu Acked-by: Marc Zyngier Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20240421063328.29710-1-shiqiliu@hust.edu.cn Signed-off-by: Will Deacon --- arch/arm64/include/asm/sysreg.h | 24 ++++++++++++------------ tools/arch/arm64/include/asm/sysreg.h | 24 ++++++++++++------------ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 9e8999592f3a..af3b206fa423 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1036,18 +1036,18 @@ * Permission Indirection Extension (PIE) permission encodings. * Encodings with the _O suffix, have overlays applied (Permission Overlay Extension). */ -#define PIE_NONE_O 0x0 -#define PIE_R_O 0x1 -#define PIE_X_O 0x2 -#define PIE_RX_O 0x3 -#define PIE_RW_O 0x5 -#define PIE_RWnX_O 0x6 -#define PIE_RWX_O 0x7 -#define PIE_R 0x8 -#define PIE_GCS 0x9 -#define PIE_RX 0xa -#define PIE_RW 0xc -#define PIE_RWX 0xe +#define PIE_NONE_O UL(0x0) +#define PIE_R_O UL(0x1) +#define PIE_X_O UL(0x2) +#define PIE_RX_O UL(0x3) +#define PIE_RW_O UL(0x5) +#define PIE_RWnX_O UL(0x6) +#define PIE_RWX_O UL(0x7) +#define PIE_R UL(0x8) +#define PIE_GCS UL(0x9) +#define PIE_RX UL(0xa) +#define PIE_RW UL(0xc) +#define PIE_RWX UL(0xe) #define PIRx_ELx_PERM(idx, perm) ((perm) << ((idx) * 4)) diff --git a/tools/arch/arm64/include/asm/sysreg.h b/tools/arch/arm64/include/asm/sysreg.h index ccc13e991376..cd8420e8c3ad 100644 --- a/tools/arch/arm64/include/asm/sysreg.h +++ b/tools/arch/arm64/include/asm/sysreg.h @@ -701,18 +701,18 @@ * Permission Indirection Extension (PIE) permission encodings. * Encodings with the _O suffix, have overlays applied (Permission Overlay Extension). */ -#define PIE_NONE_O 0x0 -#define PIE_R_O 0x1 -#define PIE_X_O 0x2 -#define PIE_RX_O 0x3 -#define PIE_RW_O 0x5 -#define PIE_RWnX_O 0x6 -#define PIE_RWX_O 0x7 -#define PIE_R 0x8 -#define PIE_GCS 0x9 -#define PIE_RX 0xa -#define PIE_RW 0xc -#define PIE_RWX 0xe +#define PIE_NONE_O UL(0x0) +#define PIE_R_O UL(0x1) +#define PIE_X_O UL(0x2) +#define PIE_RX_O UL(0x3) +#define PIE_RW_O UL(0x5) +#define PIE_RWnX_O UL(0x6) +#define PIE_RWX_O UL(0x7) +#define PIE_R UL(0x8) +#define PIE_GCS UL(0x9) +#define PIE_RX UL(0xa) +#define PIE_RW UL(0xc) +#define PIE_RWX UL(0xe) #define PIRx_ELx_PERM(idx, perm) ((perm) << ((idx) * 4)) From 3a2d2ca42975d7550d2ced663c64e54ab83ece68 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 22 Apr 2024 12:35:22 +0100 Subject: [PATCH 51/64] arm64: assembler: update stale comment for disable_step_tsk A comment in the disable_step_tsk macro refers to synchronising with enable_dbg, as historically the entry code used enable_dbg to unmask debug exceptions after disabling single-stepping. These days the unmasking happens in entry-common.c via local_daif_restore() or local_daif_inherit(), so the comment is stale. This logic is likely to change in future, so it would be best to avoid referring to those macros specifically. Update the comment to take this into account, and describe it in terms of clearing DAIF.D so that it doesn't matter where this logic lives nor what it is called. There should be no functional change as a result of this patch. 
Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Mark Brown Cc: Will Deacon Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240422113523.4070414-2-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/assembler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index ab8b396428da..b27dac4a9c0f 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -59,7 +59,7 @@ mrs \tmp, mdscr_el1 bic \tmp, \tmp, #DBG_MDSCR_SS msr mdscr_el1, \tmp - isb // Synchronise with enable_dbg + isb // Take effect before a subsequent clear of DAIF.D 9990: .endm From 080297beccf77433053621a222c332ae603a1a84 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 22 Apr 2024 12:35:23 +0100 Subject: [PATCH 52/64] arm64: defer clearing DAIF.D For historical reasons we unmask debug exceptions in __cpu_setup(), but it's not necessary to unmask debug exceptions this early in the boot/idle entry paths. It would be better to unmask debug exceptions later in C code as this simplifies the current code and will make it easier to rework exception masking logic to handle non-DAIF bits in future (e.g. PSTATE.{ALLINT,PM}). We started clearing DAIF.D in __cpu_setup() in commit: 2ce39ad15182604b ("arm64: debug: unmask PSTATE.D earlier") At the time, we needed to ensure that DAIF.D was clear on the primary CPU before scheduling and preemption were possible, and chose to do this in __cpu_setup() so that this occurred in the same place for primary and secondary CPUs. As we cannot handle debug exceptions this early, we placed an ISB between initializing MDSCR_EL1 and clearing DAIF.D so that no exceptions should be triggered. Subsequently we rewrote the return-from-{idle,suspend} paths to use __cpu_setup() in commit: cabe1c81ea5be983 ("arm64: Change cpu_resume() to enable mmu early then access sleep_sp by va") ... which allowed for earlier use of the MMU and had the desirable property of using the same code to reset the CPU in the cold and warm boot paths. This introduced a bug: DAIF.D was clear while cpu_do_resume() restored MDSCR_EL1 and other control registers (e.g. breakpoint/watchpoint control/value registers), and so we could unexpectedly take debug exceptions. We fixed that in commit: 744c6c37cc18705d ("arm64: kernel: Fix unmasked debug exceptions when restoring mdscr_el1") ... by having cpu_do_resume() use the `disable_dbg` macro to set DAIF.D before restoring MDSCR_EL1 and other control registers. This relies on DAIF.D being subsequently cleared again in cpu_resume(). Subsequently we reworked DAIF masking in commit: 0fbeb318754860b3 ("arm64: explicitly mask all exceptions") ... where we began enforcing a policy that DAIF.D being set implies all other DAIF bits are set, and so e.g. we cannot take an IRQ while DAIF.D is set. As part of this the use of `disable_dbg` in cpu_resume() was replaced with `disable_daif` for consistency with the rest of the kernel. These days, there's no need to clear DAIF.D early within __cpu_setup(): * setup_arch() clears DAIF.DA before scheduling and preemption are possible on the primary CPU, avoiding the problem we were originally trying to work around. Note: DAIF.IF gets cleared later when interrupts are enabled for the first time. * secondary_start_kernel() clears all DAIF bits before scheduling and preemption are possible on secondary CPUs. Note: with pseudo-NMI, the PMR is initialized here before any DAIF bits are cleared. 
Similar will be necessary for the architectural NMI. * cpu_suspend() restores all DAIF bits when returning from idle, ensuring that we don't unexpectedly leave DAIF.D clear or set. Note: with pseudo-NMI, the PMR is initialized here before DAIF is cleared. Similar will be necessary for the architectural NMI. This patch removes the unmasking of debug exceptions from __cpu_setup(), relying on the above locations to initialize DAIF. This allows some other cleanups: * It is no longer necessary for cpu_resume() to explicitly mask debug (or other) exceptions, as it is always called with all DAIF bits set. Thus we drop the use of `disable_daif`. * The `enable_dbg` macro is no longer used, and so is dropped. * It is no longer necessary to have an ISB immediately after initializing MDSCR_EL1 in __cpu_setup(), and we can revert to relying on the context synchronization that occurs when the MMU is enabled between __cpu_setup() and code which clears DAIF.D Comments are added to setup_arch() and secondary_start_kernel() to explain the initial unmasking of the DAIF bits. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Mark Brown Cc: Will Deacon Link: https://lore.kernel.org/r/20240422113523.4070414-3-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/assembler.h | 4 ---- arch/arm64/kernel/setup.c | 11 +++++++++-- arch/arm64/kernel/smp.c | 7 +++++++ arch/arm64/mm/proc.S | 10 ---------- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index b27dac4a9c0f..6f9ad2d2bb40 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -50,10 +50,6 @@ msr daif, \flags .endm - .macro enable_dbg - msr daifclr, #8 - .endm - .macro disable_step_tsk, flgs, tmp tbz \flgs, #TIF_SINGLESTEP, 9990f mrs \tmp, mdscr_el1 diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 65a052bf741f..a096e2451044 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -298,8 +298,15 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) dynamic_scs_init(); /* - * Unmask SError as soon as possible after initializing earlycon so - * that we can report any SErrors immediately. + * The primary CPU enters the kernel with all DAIF exceptions masked. + * + * We must unmask Debug and SError before preemption or scheduling is + * possible to ensure that these are consistently unmasked across + * threads, and we want to unmask SError as soon as possible after + * initializing earlycon so that we can report any SErrors immediately. + * + * IRQ and FIQ will be unmasked after the root irqchip has been + * detected and initialized. */ local_daif_restore(DAIF_PROCCTX_NOIRQ); diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 4ced34f62dab..31c8b3094dd7 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -264,6 +264,13 @@ asmlinkage notrace void secondary_start_kernel(void) set_cpu_online(cpu, true); complete(&cpu_running); + /* + * Secondary CPUs enter the kernel with all DAIF exceptions masked. + * + * As with setup_arch() we must unmask Debug and SError exceptions, and + * as the root irqchip has already been detected and initialized we can + * unmask IRQ and FIQ at the same time. 
+ */ local_daif_restore(DAIF_PROCCTX); /* diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 9d40f3ffd8d2..f4bc6c5bac06 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -135,14 +135,6 @@ SYM_FUNC_START(cpu_do_resume) msr tcr_el1, x8 msr vbar_el1, x9 - - /* - * __cpu_setup() cleared MDSCR_EL1.MDE and friends, before unmasking - * debug exceptions. By restoring MDSCR_EL1 here, we may take a debug - * exception. Mask them until local_daif_restore() in cpu_suspend() - * resets them. - */ - disable_daif msr mdscr_el1, x10 msr sctlr_el1, x12 @@ -466,8 +458,6 @@ SYM_FUNC_START(__cpu_setup) msr cpacr_el1, xzr // Reset cpacr_el1 mov x1, #1 << 12 // Reset mdscr_el1 and disable msr mdscr_el1, x1 // access to the DCC from EL0 - isb // Unmask debug exceptions now, - enable_dbg // since this is per-cpu reset_pmuserenr_el0 x1 // Disable PMU access from EL0 reset_amuserenr_el0 x1 // Disable AMU access from EL0 From 80164282b3620a3cb73de6ffda5592743e448d0e Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Tue, 23 Apr 2024 16:21:02 +0800 Subject: [PATCH 53/64] kselftest: arm64: Add a null pointer check There is a 'malloc' call, which can be unsuccessful. This patch will add the malloc failure checking to avoid possible null dereference and give more information about test fail reasons. Signed-off-by: Kunwu Chan Reviewed-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240423082102.2018886-1-chentao@kylinos.cn Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/tags/tags_test.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/arm64/tags/tags_test.c b/tools/testing/selftests/arm64/tags/tags_test.c index 5701163460ef..955f87c1170d 100644 --- a/tools/testing/selftests/arm64/tags/tags_test.c +++ b/tools/testing/selftests/arm64/tags/tags_test.c @@ -6,6 +6,7 @@ #include #include #include +#include "../../kselftest.h" #define SHIFT_TAG(tag) ((uint64_t)(tag) << 56) #define SET_TAG(ptr, tag) (((uint64_t)(ptr) & ~SHIFT_TAG(0xff)) | \ @@ -21,6 +22,9 @@ int main(void) if (prctl(PR_SET_TAGGED_ADDR_CTRL, PR_TAGGED_ADDR_ENABLE, 0, 0, 0) == 0) tbi_enabled = 1; ptr = (struct utsname *)malloc(sizeof(*ptr)); + if (!ptr) + ksft_exit_fail_msg("Failed to allocate utsname buffer\n"); + if (tbi_enabled) tag = 0x42; ptr = (struct utsname *)SET_TAG(ptr, tag); From 77fce82678ea5fd51442e62febec2004f79e041b Mon Sep 17 00:00:00 2001 From: Junhao He Date: Thu, 25 Apr 2024 20:46:25 +0800 Subject: [PATCH 54/64] drivers/perf: hisi_pcie: Fix out-of-bound access when valid event group The perf tool allows users to create event groups through following cmd [1], but the driver does not check whether the array index is out of bounds when writing data to the event_group array. If the number of events in an event_group is greater than HISI_PCIE_MAX_COUNTERS, the memory write overflow of event_group array occurs. Add array index check to fix the possible array out of bounds violation, and return directly when write new events are written to array bounds. There are 9 different events in an event_group. [1] perf stat -e '{pmu/event1/, ... 
,pmu/event9/}' Fixes: 8404b0fbc7fb ("drivers/perf: hisi: Add driver for HiSilicon PCIe PMU") Signed-off-by: Junhao He Reviewed-by: Jijie Shao Acked-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240425124627.13764-2-hejunhao3@huawei.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_pcie_pmu.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c index b1e4739fbdb0..03c506aa3853 100644 --- a/drivers/perf/hisilicon/hisi_pcie_pmu.c +++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c @@ -350,15 +350,27 @@ static bool hisi_pcie_pmu_validate_event_group(struct perf_event *event) return false; for (num = 0; num < counters; num++) { + /* + * If we find a related event, then it's a valid group + * since we don't need to allocate a new counter for it. + */ if (hisi_pcie_pmu_cmp_event(event_group[num], sibling)) break; } + /* + * Otherwise it's a new event but if there's no available counter, + * fail the check since we cannot schedule all the events in + * the group simultaneously. + */ + if (num == HISI_PCIE_MAX_COUNTERS) + return false; + if (num == counters) event_group[counters++] = sibling; } - return counters <= HISI_PCIE_MAX_COUNTERS; + return true; } static int hisi_pcie_pmu_event_init(struct perf_event *event) From 81bdd60a3d1d3b05e6cc6674845afb1694dd3a0e Mon Sep 17 00:00:00 2001 From: Junhao He Date: Thu, 25 Apr 2024 20:46:26 +0800 Subject: [PATCH 55/64] drivers/perf: hisi: hns3: Fix out-of-bound access when valid event group The perf tool allows users to create event groups through following cmd [1], but the driver does not check whether the array index is out of bounds when writing data to the event_group array. If the number of events in an event_group is greater than HNS3_PMU_MAX_HW_EVENTS, the memory write overflow of event_group array occurs. Add array index check to fix the possible array out of bounds violation, and return directly when write new events are written to array bounds. There are 9 different events in an event_group. [1] perf stat -e '{pmu/event1/, ... ,pmu/event9/} Fixes: 66637ab137b4 ("drivers/perf: hisi: add driver for HNS3 PMU") Signed-off-by: Junhao He Signed-off-by: Hao Chen Acked-by: Jonathan Cameron Reviewed-by: Jijie Shao Link: https://lore.kernel.org/r/20240425124627.13764-3-hejunhao3@huawei.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hns3_pmu.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/perf/hisilicon/hns3_pmu.c b/drivers/perf/hisilicon/hns3_pmu.c index 5236acdcc2e1..0417bf23fa3e 100644 --- a/drivers/perf/hisilicon/hns3_pmu.c +++ b/drivers/perf/hisilicon/hns3_pmu.c @@ -1085,15 +1085,27 @@ static bool hns3_pmu_validate_event_group(struct perf_event *event) return false; for (num = 0; num < counters; num++) { + /* + * If we find a related event, then it's a valid group + * since we don't need to allocate a new counter for it. + */ if (hns3_pmu_cmp_event(event_group[num], sibling)) break; } + /* + * Otherwise it's a new event but if there's no available counter, + * fail the check since we cannot schedule all the events in + * the group simultaneously. 
+ */ + if (num == HNS3_PMU_MAX_HW_EVENTS) + return false; + if (num == counters) event_group[counters++] = sibling; } - return counters <= HNS3_PMU_MAX_HW_EVENTS; + return true; } static u32 hns3_pmu_get_filter_condition(struct perf_event *event) From 582c1aeee0a9e73010cf1c4cef338709860deeb0 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 25 Apr 2024 20:46:27 +0800 Subject: [PATCH 56/64] drivers/perf: hisi: hns3: Actually use devm_add_action_or_reset() pci_alloc_irq_vectors() allocates an irq vector. When devm_add_action() fails, the irq vector is not freed, which leads to a memory leak. Replace the devm_add_action with devm_add_action_or_reset to ensure the irq vector can be destroyed when it fails. Fixes: 66637ab137b4 ("drivers/perf: hisi: add driver for HNS3 PMU") Signed-off-by: Hao Chen Signed-off-by: Junhao He Reviewed-by: Jijie Shao Acked-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240425124627.13764-4-hejunhao3@huawei.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hns3_pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/hisilicon/hns3_pmu.c b/drivers/perf/hisilicon/hns3_pmu.c index 0417bf23fa3e..e900f8e00b18 100644 --- a/drivers/perf/hisilicon/hns3_pmu.c +++ b/drivers/perf/hisilicon/hns3_pmu.c @@ -1528,7 +1528,7 @@ static int hns3_pmu_irq_register(struct pci_dev *pdev, return ret; } - ret = devm_add_action(&pdev->dev, hns3_pmu_free_irq, pdev); + ret = devm_add_action_or_reset(&pdev->dev, hns3_pmu_free_irq, pdev); if (ret) { pci_err(pdev, "failed to add free irq action, ret = %d.\n", ret); return ret; From 1279e8d0dcead53cf1f51e926a1cf6d2a79332d6 Mon Sep 17 00:00:00 2001 From: Andrea della Porta Date: Mon, 29 Apr 2024 12:28:33 +0200 Subject: [PATCH 57/64] arm64: Add the arm64.no32bit_el0 command line option Introducing the field 'el0' to the idreg-override for register ID_AA64PFR0_EL1. This field is also aliased to the new kernel command line option 'arm64.no32bit_el0' as a more recognizable and mnemonic name to disable the execution of 32 bit userspace applications (i.e. avoid Aarch32 execution state in EL0) from kernel command line. Link: https://lore.kernel.org/all/20240207105847.7739-1-andrea.porta@suse.com/ Signed-off-by: Andrea della Porta Link: https://lore.kernel.org/r/20240429102833.6426-1-andrea.porta@suse.com Signed-off-by: Will Deacon --- Documentation/admin-guide/kernel-parameters.txt | 3 +++ arch/arm64/kernel/pi/idreg-override.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 623fce7d5fcd..afd6fcc1e8b9 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -431,6 +431,9 @@ arcrimi= [HW,NET] ARCnet - "RIM I" (entirely mem-mapped) cards Format: ,, + arm64.no32bit_el0 [ARM64] Unconditionally disable the execution of + 32 bit applications. 
+ arm64.nobti [ARM64] Unconditionally disable Branch Target Identification support diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index aad399796e81..48c1aa456af9 100644 --- a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -108,6 +108,7 @@ static const struct ftr_set_desc pfr0 __prel64_initconst = { .override = &id_aa64pfr0_override, .fields = { FIELD("sve", ID_AA64PFR0_EL1_SVE_SHIFT, pfr0_sve_filter), + FIELD("el0", ID_AA64PFR0_EL1_EL0_SHIFT, NULL), {} }, }; @@ -223,6 +224,7 @@ static const struct { { "nokaslr", "arm64_sw.nokaslr=1" }, { "rodata=off", "arm64_sw.rodataoff=1" }, { "arm64.nolva", "id_aa64mmfr2.varange=0" }, + { "arm64.no32bit_el0", "id_aa64pfr0.el0=1" }, }; static int __init parse_hexdigit(const char *p, u64 *v) From 410e471f87465f04d7ae7f8ed16ef8e7a3b5517c Mon Sep 17 00:00:00 2001 From: chenqiwu Date: Tue, 19 Dec 2023 10:22:29 +0800 Subject: [PATCH 58/64] arm64: Add USER_STACKTRACE support Currently, userstacktrace is unsupported for ftrace and uprobe tracers on arm64. This patch uses the perf_callchain_user() code as blueprint to implement the arch_stack_walk_user() which add userstacktrace support on arm64. Meanwhile, we can use arch_stack_walk_user() to simplify the implementation of perf_callchain_user(). This patch is tested pass with ftrace, uprobe and perf tracers profiling userstacktrace cases. Tested-by: chenqiwu Signed-off-by: chenqiwu Link: https://lore.kernel.org/r/20231219022229.10230-1-qiwu.chen@transsion.com Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 1 + arch/arm64/kernel/perf_callchain.c | 118 +--------------------------- arch/arm64/kernel/stacktrace.c | 120 +++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+), 114 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7b11c98b3e84..7f7bbee7d257 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -258,6 +258,7 @@ config ARM64 select TRACE_IRQFLAGS_SUPPORT select TRACE_IRQFLAGS_NMI_SUPPORT select HAVE_SOFTIRQ_ON_OWN_STACK + select USER_STACKTRACE_SUPPORT help ARM 64-bit (AArch64) Linux support. diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 6d157f32187b..e8ed5673f481 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -10,95 +10,13 @@ #include -struct frame_tail { - struct frame_tail __user *fp; - unsigned long lr; -} __attribute__((packed)); - -/* - * Get the return address for a single stackframe and return a pointer to the - * next frame tail. - */ -static struct frame_tail __user * -user_backtrace(struct frame_tail __user *tail, - struct perf_callchain_entry_ctx *entry) +static bool callchain_trace(void *data, unsigned long pc) { - struct frame_tail buftail; - unsigned long err; - unsigned long lr; + struct perf_callchain_entry_ctx *entry = data; - /* Also check accessibility of one struct frame_tail beyond */ - if (!access_ok(tail, sizeof(buftail))) - return NULL; - - pagefault_disable(); - err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); - pagefault_enable(); - - if (err) - return NULL; - - lr = ptrauth_strip_user_insn_pac(buftail.lr); - - perf_callchain_store(entry, lr); - - /* - * Frame pointers should strictly progress back up the stack - * (towards higher addresses). 
- */ - if (tail >= buftail.fp) - return NULL; - - return buftail.fp; + return perf_callchain_store(entry, pc) == 0; } -#ifdef CONFIG_COMPAT -/* - * The registers we're interested in are at the end of the variable - * length saved register structure. The fp points at the end of this - * structure so the address of this struct is: - * (struct compat_frame_tail *)(xxx->fp)-1 - * - * This code has been adapted from the ARM OProfile support. - */ -struct compat_frame_tail { - compat_uptr_t fp; /* a (struct compat_frame_tail *) in compat mode */ - u32 sp; - u32 lr; -} __attribute__((packed)); - -static struct compat_frame_tail __user * -compat_user_backtrace(struct compat_frame_tail __user *tail, - struct perf_callchain_entry_ctx *entry) -{ - struct compat_frame_tail buftail; - unsigned long err; - - /* Also check accessibility of one struct frame_tail beyond */ - if (!access_ok(tail, sizeof(buftail))) - return NULL; - - pagefault_disable(); - err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); - pagefault_enable(); - - if (err) - return NULL; - - perf_callchain_store(entry, buftail.lr); - - /* - * Frame pointers should strictly progress back up the stack - * (towards higher addresses). - */ - if (tail + 1 >= (struct compat_frame_tail __user *) - compat_ptr(buftail.fp)) - return NULL; - - return (struct compat_frame_tail __user *)compat_ptr(buftail.fp) - 1; -} -#endif /* CONFIG_COMPAT */ - void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { @@ -107,35 +25,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, return; } - perf_callchain_store(entry, regs->pc); - - if (!compat_user_mode(regs)) { - /* AARCH64 mode */ - struct frame_tail __user *tail; - - tail = (struct frame_tail __user *)regs->regs[29]; - - while (entry->nr < entry->max_stack && - tail && !((unsigned long)tail & 0x7)) - tail = user_backtrace(tail, entry); - } else { -#ifdef CONFIG_COMPAT - /* AARCH32 compat mode */ - struct compat_frame_tail __user *tail; - - tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; - - while ((entry->nr < entry->max_stack) && - tail && !((unsigned long)tail & 0x3)) - tail = compat_user_backtrace(tail, entry); -#endif - } -} - -static bool callchain_trace(void *data, unsigned long pc) -{ - struct perf_callchain_entry_ctx *entry = data; - return perf_callchain_store(entry, pc) == 0; + arch_stack_walk_user(callchain_trace, entry, regs); } void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 684c26511696..6b3258860377 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -324,3 +324,123 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) dump_backtrace(NULL, tsk, loglvl); barrier(); } + +/* + * The struct defined for userspace stack frame in AARCH64 mode. + */ +struct frame_tail { + struct frame_tail __user *fp; + unsigned long lr; +} __attribute__((packed)); + +/* + * Get the return address for a single stackframe and return a pointer to the + * next frame tail. 
+ */ +static struct frame_tail __user * +unwind_user_frame(struct frame_tail __user *tail, void *cookie, + stack_trace_consume_fn consume_entry) +{ + struct frame_tail buftail; + unsigned long err; + unsigned long lr; + + /* Also check accessibility of one struct frame_tail beyond */ + if (!access_ok(tail, sizeof(buftail))) + return NULL; + + pagefault_disable(); + err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); + pagefault_enable(); + + if (err) + return NULL; + + lr = ptrauth_strip_user_insn_pac(buftail.lr); + + if (!consume_entry(cookie, lr)) + return NULL; + + /* + * Frame pointers should strictly progress back up the stack + * (towards higher addresses). + */ + if (tail >= buftail.fp) + return NULL; + + return buftail.fp; +} + +#ifdef CONFIG_COMPAT +/* + * The registers we're interested in are at the end of the variable + * length saved register structure. The fp points at the end of this + * structure so the address of this struct is: + * (struct compat_frame_tail *)(xxx->fp)-1 + * + * This code has been adapted from the ARM OProfile support. + */ +struct compat_frame_tail { + compat_uptr_t fp; /* a (struct compat_frame_tail *) in compat mode */ + u32 sp; + u32 lr; +} __attribute__((packed)); + +static struct compat_frame_tail __user * +unwind_compat_user_frame(struct compat_frame_tail __user *tail, void *cookie, + stack_trace_consume_fn consume_entry) +{ + struct compat_frame_tail buftail; + unsigned long err; + + /* Also check accessibility of one struct frame_tail beyond */ + if (!access_ok(tail, sizeof(buftail))) + return NULL; + + pagefault_disable(); + err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail)); + pagefault_enable(); + + if (err) + return NULL; + + if (!consume_entry(cookie, buftail.lr)) + return NULL; + + /* + * Frame pointers should strictly progress back up the stack + * (towards higher addresses). + */ + if (tail + 1 >= (struct compat_frame_tail __user *) + compat_ptr(buftail.fp)) + return NULL; + + return (struct compat_frame_tail __user *)compat_ptr(buftail.fp) - 1; +} +#endif /* CONFIG_COMPAT */ + + +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, + const struct pt_regs *regs) +{ + if (!consume_entry(cookie, regs->pc)) + return; + + if (!compat_user_mode(regs)) { + /* AARCH64 mode */ + struct frame_tail __user *tail; + + tail = (struct frame_tail __user *)regs->regs[29]; + while (tail && !((unsigned long)tail & 0x7)) + tail = unwind_user_frame(tail, cookie, consume_entry); + } else { +#ifdef CONFIG_COMPAT + /* AARCH32 compat mode */ + struct compat_frame_tail __user *tail; + + tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; + while (tail && !((unsigned long)tail & 0x3)) + tail = unwind_compat_user_frame(tail, cookie, consume_entry); +#endif + } +} From 588de8c6d3621a4d712ccf834c205a74a84180a8 Mon Sep 17 00:00:00 2001 From: George Guo Date: Tue, 30 Apr 2024 16:56:55 +0800 Subject: [PATCH 59/64] arm64: simplify arch_static_branch/_jump function Extracted the jump table definition code from the arch_static_branch and arch_static_branch_jump functions into a macro JUMP_TABLE_ENTRY to reduce code duplication. 
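For reference, callers of the static-key API are unaffected by this cleanup; a typical user still looks like the sketch below (illustrative only, with hypothetical names, not part of this patch), and each such branch site emits one __jump_table record through the shared JUMP_TABLE_ENTRY macro:

  /* Illustrative sketch; my_feature_key and do_slow_feature() are hypothetical. */
  #include <linux/jump_label.h>

  DEFINE_STATIC_KEY_FALSE(my_feature_key);

  static void do_slow_feature(void) { }

  void my_hot_path(void)
  {
          /*
           * Compiles to a single NOP (patched to a branch when the key is
           * enabled); the key address and target label are recorded in the
           * __jump_table entry emitted by arch_static_branch().
           */
          if (static_branch_unlikely(&my_feature_key))
                  do_slow_feature();
  }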
Signed-off-by: George Guo Link: https://lore.kernel.org/r/20240430085655.2798551-2-dongtai.guo@linux.dev Signed-off-by: Will Deacon --- arch/arm64/include/asm/jump_label.h | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/jump_label.h b/arch/arm64/include/asm/jump_label.h index 6aafbb789991..4e753908b801 100644 --- a/arch/arm64/include/asm/jump_label.h +++ b/arch/arm64/include/asm/jump_label.h @@ -15,17 +15,23 @@ #define JUMP_LABEL_NOP_SIZE AARCH64_INSN_SIZE +#define JUMP_TABLE_ENTRY(key, label) \ + ".pushsection __jump_table, \"aw\"\n\t" \ + ".align 3\n\t" \ + ".long 1b - ., %l["#label"] - .\n\t" \ + ".quad %c0 - .\n\t" \ + ".popsection\n\t" \ + : : "i"(key) : : label + static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { + char *k = &((char *)key)[branch]; + asm goto( "1: nop \n\t" - " .pushsection __jump_table, \"aw\" \n\t" - " .align 3 \n\t" - " .long 1b - ., %l[l_yes] - . \n\t" - " .quad %c0 - . \n\t" - " .popsection \n\t" - : : "i"(&((char *)key)[branch]) : : l_yes); + JUMP_TABLE_ENTRY(k, l_yes) + ); return false; l_yes: @@ -35,15 +41,11 @@ static __always_inline bool arch_static_branch(struct static_key * const key, static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { + char *k = &((char *)key)[branch]; asm goto( "1: b %l[l_yes] \n\t" - " .pushsection __jump_table, \"aw\" \n\t" - " .align 3 \n\t" - " .long 1b - ., %l[l_yes] - . \n\t" - " .quad %c0 - . \n\t" - " .popsection \n\t" - : : "i"(&((char *)key)[branch]) : : l_yes); - + JUMP_TABLE_ENTRY(k, l_yes) + ); return false; l_yes: return true; From b28c74e259675aa0eade6be5d5efaa4d72e06c83 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 3 May 2024 15:45:59 +0100 Subject: [PATCH 60/64] arm64/mm: generalize PMD_PRESENT_INVALID for all levels As preparation for the next patch, which frees up the PTE_PROT_NONE present pte and swap pte bit, generalize PMD_PRESENT_INVALID to PTE_PRESENT_INVALID. This will then be used to mark PROT_NONE ptes (and entries at any other level) in the next patch. While we're at it, fix up the swap pte format comment to include PTE_PRESENT_INVALID. This is not new, it just wasn't previously documented. Reviewed-by: Catalin Marinas Signed-off-by: Ryan Roberts Link: https://lore.kernel.org/r/20240503144604.151095-2-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable-prot.h | 8 ++++---- arch/arm64/include/asm/pgtable.h | 21 ++++++++++++--------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index dd9ee67d1d87..cdbf51eef7a6 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -21,11 +21,11 @@ #define PTE_PROT_NONE (_AT(pteval_t, 1) << 58) /* only when !PTE_VALID */ /* - * This bit indicates that the entry is present i.e. pmd_page() - * still points to a valid huge page in memory even if the pmd - * has been invalidated. + * PTE_PRESENT_INVALID=1 & PTE_VALID=0 indicates that the pte's fields should be + * interpreted according to the HW layout by SW but any attempted HW access to + * the address will result in a fault. pte_present() returns true. 
*/ -#define PMD_PRESENT_INVALID (_AT(pteval_t, 1) << 59) /* only when !PMD_SECT_VALID */ +#define PTE_PRESENT_INVALID (_AT(pteval_t, 1) << 59) /* only when !PTE_VALID */ #define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) #define _PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 92c9aed5e7af..a3e417f388e9 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -132,6 +132,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) #define pte_dirty(pte) (pte_sw_dirty(pte) || pte_hw_dirty(pte)) #define pte_valid(pte) (!!(pte_val(pte) & PTE_VALID)) +#define pte_present_invalid(pte) \ + ((pte_val(pte) & (PTE_VALID | PTE_PRESENT_INVALID)) == PTE_PRESENT_INVALID) /* * Execute-only user mappings do not have the PTE_USER bit set. All valid * kernel mappings have the PTE_UXN bit set. @@ -261,6 +263,13 @@ static inline pte_t pte_mkpresent(pte_t pte) return set_pte_bit(pte, __pgprot(PTE_VALID)); } +static inline pte_t pte_mkinvalid(pte_t pte) +{ + pte = set_pte_bit(pte, __pgprot(PTE_PRESENT_INVALID)); + pte = clear_pte_bit(pte, __pgprot(PTE_VALID)); + return pte; +} + static inline pmd_t pmd_mkcont(pmd_t pmd) { return __pmd(pmd_val(pmd) | PMD_SECT_CONT); @@ -483,7 +492,7 @@ static inline int pmd_protnone(pmd_t pmd) } #endif -#define pmd_present_invalid(pmd) (!!(pmd_val(pmd) & PMD_PRESENT_INVALID)) +#define pmd_present_invalid(pmd) pte_present_invalid(pmd_pte(pmd)) static inline int pmd_present(pmd_t pmd) { @@ -513,14 +522,7 @@ static inline int pmd_trans_huge(pmd_t pmd) #define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) - -static inline pmd_t pmd_mkinvalid(pmd_t pmd) -{ - pmd = set_pmd_bit(pmd, __pgprot(PMD_PRESENT_INVALID)); - pmd = clear_pmd_bit(pmd, __pgprot(PMD_SECT_VALID)); - - return pmd; -} +#define pmd_mkinvalid(pmd) pte_pmd(pte_mkinvalid(pmd_pte(pmd))) #define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd)) @@ -1258,6 +1260,7 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, * bits 3-7: swap type * bits 8-57: swap offset * bit 58: PTE_PROT_NONE (must be zero) + * bit 59: PTE_PRESENT_INVALID (must be zero) */ #define __SWP_TYPE_SHIFT 3 #define __SWP_TYPE_BITS 5 From f0f5863a0fb0fb48a5881c3f6acca1958899dd76 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 3 May 2024 15:46:00 +0100 Subject: [PATCH 61/64] arm64/mm: Remove PTE_PROT_NONE bit Currently the PTE_PRESENT_INVALID and PTE_PROT_NONE functionality explicitly occupy 2 bits in the PTE when PTE_VALID/PMD_SECT_VALID is clear. This has 2 significant consequences: - PTE_PROT_NONE consumes a precious SW PTE bit that could be used for other things. - The swap pte layout must reserve those same 2 bits and ensure they are both always zero for a swap pte. It would be nice to reclaim at least one of those bits. But PTE_PRESENT_INVALID, which since the previous patch, applies uniformly to page/block descriptors at any level when PTE_VALID is clear, can already give us most of what PTE_PROT_NONE requires: If it is set, then the pte is still considered present; pte_present() returns true and all the fields in the pte follow the HW interpretation (e.g. SW can safely call pte_pfn(), etc). But crucially, the HW treats the pte as invalid and will fault if it hits. 
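Expressed as a sketch (illustrative only, not code from this series), that "present to SW, invalid to HW" state means the fields can still be read normally even though any hardware access would fault:

  /*
   * Illustrative helper, not part of the patch: present_invalid_pfn() is a
   * hypothetical name. The HW would fault on this pte, but SW can still
   * interpret its fields because pte_present() remains true.
   */
  static inline unsigned long present_invalid_pfn(pte_t pte)
  {
          VM_WARN_ON(!pte_present(pte) || pte_valid(pte));
          return pte_pfn(pte);    /* fields keep the HW layout */
  }
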
So let's remove PTE_PROT_NONE entirely and instead represent PROT_NONE as a present but invalid pte (PTE_VALID=0, PTE_PRESENT_INVALID=1) with PTE_USER=0 and PTE_UXN=1. This is a unique combination that is not used anywhere else. The net result is a clearer, simpler, more generic encoding scheme that applies uniformly to all levels. Additionally we free up a PTE SW bit and a swap pte bit (bit 58 in both cases). Reviewed-by: Catalin Marinas Signed-off-by: Ryan Roberts Link: https://lore.kernel.org/r/20240503144604.151095-3-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable-prot.h | 3 +-- arch/arm64/include/asm/pgtable.h | 31 +++++++++++++++------------ 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index cdbf51eef7a6..81f07b44f7b8 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -18,7 +18,6 @@ #define PTE_DIRTY (_AT(pteval_t, 1) << 55) #define PTE_SPECIAL (_AT(pteval_t, 1) << 56) #define PTE_DEVMAP (_AT(pteval_t, 1) << 57) -#define PTE_PROT_NONE (_AT(pteval_t, 1) << 58) /* only when !PTE_VALID */ /* * PTE_PRESENT_INVALID=1 & PTE_VALID=0 indicates that the pte's fields should be @@ -103,7 +102,7 @@ static inline bool __pure lpa2_is_enabled(void) __val; \ }) -#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) +#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PRESENT_INVALID | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) /* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */ #define PAGE_SHARED __pgprot(_PAGE_SHARED) #define PAGE_SHARED_EXEC __pgprot(_PAGE_SHARED_EXEC) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index a3e417f388e9..300abbc0e67a 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -105,7 +105,7 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) /* * The following only work if pte_present(). Undefined behaviour otherwise. */ -#define pte_present(pte) (!!(pte_val(pte) & (PTE_VALID | PTE_PROT_NONE))) +#define pte_present(pte) (pte_valid(pte) || pte_present_invalid(pte)) #define pte_young(pte) (!!(pte_val(pte) & PTE_AF)) #define pte_special(pte) (!!(pte_val(pte) & PTE_SPECIAL)) #define pte_write(pte) (!!(pte_val(pte) & PTE_WRITE)) @@ -483,7 +483,16 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) */ static inline int pte_protnone(pte_t pte) { - return (pte_val(pte) & (PTE_VALID | PTE_PROT_NONE)) == PTE_PROT_NONE; + /* + * pte_present_invalid() tells us that the pte is invalid from HW + * perspective but present from SW perspective, so the fields are to be + * interpretted as per the HW layout. The second 2 checks are the unique + * encoding that we use for PROT_NONE. It is insufficient to only use + * the first check because we share the same encoding scheme with pmds + * which support pmd_mkinvalid(), so can be present-invalid without + * being PROT_NONE. + */ + return pte_present_invalid(pte) && !pte_user(pte) && !pte_user_exec(pte); } static inline int pmd_protnone(pmd_t pmd) @@ -492,12 +501,7 @@ static inline int pmd_protnone(pmd_t pmd) } #endif -#define pmd_present_invalid(pmd) pte_present_invalid(pmd_pte(pmd)) - -static inline int pmd_present(pmd_t pmd) -{ - return pte_present(pmd_pte(pmd)) || pmd_present_invalid(pmd); -} +#define pmd_present(pmd) pte_present(pmd_pte(pmd)) /* * THP definitions. 
@@ -1036,8 +1040,8 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) * in MAIR_EL1. The mask below has to include PTE_ATTRINDX_MASK. */ const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY | - PTE_PROT_NONE | PTE_VALID | PTE_WRITE | PTE_GP | - PTE_ATTRINDX_MASK; + PTE_PRESENT_INVALID | PTE_VALID | PTE_WRITE | + PTE_GP | PTE_ATTRINDX_MASK; /* preserve the hardware dirty information */ if (pte_hw_dirty(pte)) pte = set_pte_bit(pte, __pgprot(PTE_DIRTY)); @@ -1085,17 +1089,17 @@ static inline int pgd_devmap(pgd_t pgd) #ifdef CONFIG_PAGE_TABLE_CHECK static inline bool pte_user_accessible_page(pte_t pte) { - return pte_present(pte) && (pte_user(pte) || pte_user_exec(pte)); + return pte_valid(pte) && (pte_user(pte) || pte_user_exec(pte)); } static inline bool pmd_user_accessible_page(pmd_t pmd) { - return pmd_leaf(pmd) && !pmd_present_invalid(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd)); + return pmd_valid(pmd) && !pmd_table(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd)); } static inline bool pud_user_accessible_page(pud_t pud) { - return pud_leaf(pud) && (pud_user(pud) || pud_user_exec(pud)); + return pud_valid(pud) && !pud_table(pud) && (pud_user(pud) || pud_user_exec(pud)); } #endif @@ -1259,7 +1263,6 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, * bits 2: remember PG_anon_exclusive * bits 3-7: swap type * bits 8-57: swap offset - * bit 58: PTE_PROT_NONE (must be zero) * bit 59: PTE_PRESENT_INVALID (must be zero) */ #define __SWP_TYPE_SHIFT 3 From 55564814a838f1d2429dc757294df798f5262bd2 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 3 May 2024 15:46:01 +0100 Subject: [PATCH 62/64] arm64/mm: Move PTE_PRESENT_INVALID to overlay PTE_NG PTE_PRESENT_INVALID was previously occupying bit 59, which when a PTE is valid can either be IGNORED, PBHA[0] or AttrIndex[3], depending on the HW configuration. In practice this is currently not a problem because PTE_PRESENT_INVALID can only be 1 when PTE_VALID=0 and upstream Linux always requires the bit set to 0 for a valid pte. However, if in future Linux wants to use the field (e.g. AttrIndex[3]) then we could end up with confusion when PTE_PRESENT_INVALID comes along and corrupts the field - we would ideally want to preserve it even for an invalid (but present) pte. The other problem with bit 59 is that it prevents the offset field of a swap entry within a swap pte from growing beyond 51 bits. By moving PTE_PRESENT_INVALID to a low bit we can lay the swap pte out so that the offset field could grow to 52 bits in future. So let's move PTE_PRESENT_INVALID to overlay PTE_NG (bit 11). There is no need to persist NG for a present-invalid entry; it is always set for user mappings and is not used by SW to derive any state from the pte. PTE_NS was considered instead of PTE_NG, but it is RES0 for non-secure SW, so there is a chance that future architecture may allocate the bit and we may therefore need to persist that bit for present-invalid ptes. These are both marginal benefits, but make things a bit tidier in my opinion. 
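As a quick cross-check of the resulting layout (a sketch only; the series does not add these assertions), the relocated bit lands exactly in the gap between the swap type and swap offset fields:

  /*
   * Sketch of compile-time checks matching the layout above; hypothetical,
   * they would sit in pgtable.h after the __SWP_* definitions.
   */
  static_assert(PTE_PRESENT_INVALID == (1UL << 11));        /* now overlays PTE_NG */
  static_assert(__SWP_TYPE_SHIFT + __SWP_TYPE_BITS <= 11);  /* type ends below bit 11 */
  static_assert(__SWP_OFFSET_SHIFT == 12);                  /* offset starts above bit 11 */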
Reviewed-by: Catalin Marinas Signed-off-by: Ryan Roberts Link: https://lore.kernel.org/r/20240503144604.151095-4-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable-prot.h | 2 +- arch/arm64/include/asm/pgtable.h | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 81f07b44f7b8..35c9de13f7ed 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -24,7 +24,7 @@ * interpreted according to the HW layout by SW but any attempted HW access to * the address will result in a fault. pte_present() returns true. */ -#define PTE_PRESENT_INVALID (_AT(pteval_t, 1) << 59) /* only when !PTE_VALID */ +#define PTE_PRESENT_INVALID (PTE_NG) /* only when !PTE_VALID */ #define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) #define _PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 300abbc0e67a..6c0e322211cb 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1261,15 +1261,15 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, * Encode and decode a swap entry: * bits 0-1: present (must be zero) * bits 2: remember PG_anon_exclusive - * bits 3-7: swap type - * bits 8-57: swap offset - * bit 59: PTE_PRESENT_INVALID (must be zero) + * bits 6-10: swap type + * bit 11: PTE_PRESENT_INVALID (must be zero) + * bits 12-61: swap offset */ -#define __SWP_TYPE_SHIFT 3 +#define __SWP_TYPE_SHIFT 6 #define __SWP_TYPE_BITS 5 -#define __SWP_OFFSET_BITS 50 #define __SWP_TYPE_MASK ((1 << __SWP_TYPE_BITS) - 1) -#define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT) +#define __SWP_OFFSET_SHIFT 12 +#define __SWP_OFFSET_BITS 50 #define __SWP_OFFSET_MASK ((1UL << __SWP_OFFSET_BITS) - 1) #define __swp_type(x) (((x).val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK) From 5b32510af77bdb275b022dc0d6d5b9c61751065b Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 3 May 2024 15:46:02 +0100 Subject: [PATCH 63/64] arm64/mm: Add uffd write-protect support Let's use the newly-free PTE SW bit (58) to add support for uffd-wp. The standard handlers are implemented for set/test/clear for both pte and pmd. Additionally we must also track the uffd-wp state as a pte swp bit, so use a free swap pte bit (3). 
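The arch helpers keep the invariants that the generic uffd-wp code relies on; roughly (an illustrative sketch, not taken from this patch):

  /* Illustrative only: uffd_wp_example() is a hypothetical demonstration. */
  static void uffd_wp_example(pte_t pte)
  {
          pte = pte_mkuffd_wp(pte);       /* sets PTE_UFFD_WP and write-protects */
          WARN_ON(pte_write(pte));        /* a uffd-wp pte is never writable */
          WARN_ON(!pte_uffd_wp(pte));

          pte = pte_clear_uffd_wp(pte);   /* clears the marker only; the pte
                                           * remains write-protected until mm
                                           * core grants write access again
                                           */
          WARN_ON(pte_uffd_wp(pte));
  }

The swap-side helpers (pte_swp_mkuffd_wp() and friends) mirror this so the marker survives swap-out via bit 3 of the swap pte.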
Acked-by: Peter Xu Reviewed-by: Catalin Marinas Reviewed-by: David Hildenbrand Signed-off-by: Ryan Roberts Link: https://lore.kernel.org/r/20240503144604.151095-5-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/pgtable-prot.h | 8 +++++ arch/arm64/include/asm/pgtable.h | 44 +++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7b11c98b3e84..763e221f2169 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -255,6 +255,7 @@ config ARM64 select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD + select HAVE_ARCH_USERFAULTFD_WP if USERFAULTFD select TRACE_IRQFLAGS_SUPPORT select TRACE_IRQFLAGS_NMI_SUPPORT select HAVE_SOFTIRQ_ON_OWN_STACK diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 35c9de13f7ed..b11cfb9fdd37 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -26,6 +26,14 @@ */ #define PTE_PRESENT_INVALID (PTE_NG) /* only when !PTE_VALID */ +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +#define PTE_UFFD_WP (_AT(pteval_t, 1) << 58) /* uffd-wp tracking */ +#define PTE_SWP_UFFD_WP (_AT(pteval_t, 1) << 3) /* only for swp ptes */ +#else +#define PTE_UFFD_WP (_AT(pteval_t, 0)) +#define PTE_SWP_UFFD_WP (_AT(pteval_t, 0)) +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ + #define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) #define _PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 6c0e322211cb..78fd3e2ef8c6 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -280,6 +280,23 @@ static inline pte_t pte_mkdevmap(pte_t pte) return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL)); } +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +static inline int pte_uffd_wp(pte_t pte) +{ + return !!(pte_val(pte) & PTE_UFFD_WP); +} + +static inline pte_t pte_mkuffd_wp(pte_t pte) +{ + return pte_wrprotect(set_pte_bit(pte, __pgprot(PTE_UFFD_WP))); +} + +static inline pte_t pte_clear_uffd_wp(pte_t pte) +{ + return clear_pte_bit(pte, __pgprot(PTE_UFFD_WP)); +} +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ + static inline void __set_pte_nosync(pte_t *ptep, pte_t pte) { WRITE_ONCE(*ptep, pte); @@ -477,6 +494,23 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return clear_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE)); } +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +static inline pte_t pte_swp_mkuffd_wp(pte_t pte) +{ + return set_pte_bit(pte, __pgprot(PTE_SWP_UFFD_WP)); +} + +static inline int pte_swp_uffd_wp(pte_t pte) +{ + return !!(pte_val(pte) & PTE_SWP_UFFD_WP); +} + +static inline pte_t pte_swp_clear_uffd_wp(pte_t pte) +{ + return clear_pte_bit(pte, __pgprot(PTE_SWP_UFFD_WP)); +} +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ + #ifdef CONFIG_NUMA_BALANCING /* * See the comment in include/linux/pgtable.h @@ -527,6 +561,15 @@ static inline int pmd_trans_huge(pmd_t pmd) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) #define pmd_mkinvalid(pmd) pte_pmd(pte_mkinvalid(pmd_pte(pmd))) +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +#define pmd_uffd_wp(pmd) pte_uffd_wp(pmd_pte(pmd)) +#define pmd_mkuffd_wp(pmd) pte_pmd(pte_mkuffd_wp(pmd_pte(pmd))) +#define pmd_clear_uffd_wp(pmd) pte_pmd(pte_clear_uffd_wp(pmd_pte(pmd))) +#define pmd_swp_uffd_wp(pmd) 
pte_swp_uffd_wp(pmd_pte(pmd)) +#define pmd_swp_mkuffd_wp(pmd) pte_pmd(pte_swp_mkuffd_wp(pmd_pte(pmd))) +#define pmd_swp_clear_uffd_wp(pmd) \ + pte_pmd(pte_swp_clear_uffd_wp(pmd_pte(pmd))) +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ #define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd)) @@ -1261,6 +1304,7 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, * Encode and decode a swap entry: * bits 0-1: present (must be zero) * bits 2: remember PG_anon_exclusive + * bit 3: remember uffd-wp state * bits 6-10: swap type * bit 11: PTE_PRESENT_INVALID (must be zero) * bits 12-61: swap offset From cb67ea121cdd3e4ecea306fbb0058d031d5ad950 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 9 May 2024 13:28:42 +0100 Subject: [PATCH 64/64] arm64/mm: Fix pud_user_accessible_page() for PGTABLE_LEVELS <= 2 The recent change to use pud_valid() as part of the implementation of pud_user_accessible_page() fails to build when PGTABLE_LEVELS <= 2 because pud_valid() is not defined in that case. Fix this by defining pud_valid() to false for this case. This means that pud_user_accessible_page() will correctly always return false for this config. Fixes: f0f5863a0fb0 ("arm64/mm: Remove PTE_PROT_NONE bit") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405082221.43rfWxz5-lkp@intel.com/ Signed-off-by: Ryan Roberts Link: https://lore.kernel.org/r/20240509122844.563320-1-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 78fd3e2ef8c6..bde9fd179388 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -814,6 +814,7 @@ static inline pmd_t *pud_pgtable(pud_t pud) #else +#define pud_valid(pud) false #define pud_page_paddr(pud) ({ BUILD_BUG(); 0; }) #define pud_user_exec(pud) pud_user(pud) /* Always 0 with folding */