From d9fef76e89498bf99cdb03f77b7091d7e95d7edd Mon Sep 17 00:00:00 2001 From: Julien Panis Date: Thu, 16 May 2024 12:44:37 +0200 Subject: [PATCH 1/5] thermal/drivers/mediatek/lvts_thermal: Remove filtered mode for mt8188 Filtered mode is not supported on mt8188 SoC and is the source of bad results. Move to immediate mode which provides good temperatures. Fixes: f4745f546e60 ("thermal/drivers/mediatek/lvts_thermal: Add MT8188 support") Reviewed-by: Nicolas Pitre Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Julien Panis Link: https://lore.kernel.org/r/20240516-mtk-thermal-mt8188-mode-fix-v2-1-40a317442c62@baylibre.com Signed-off-by: Daniel Lezcano --- drivers/thermal/mediatek/lvts_thermal.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/thermal/mediatek/lvts_thermal.c b/drivers/thermal/mediatek/lvts_thermal.c index 0bb3a495b56e..82c355c466cf 100644 --- a/drivers/thermal/mediatek/lvts_thermal.c +++ b/drivers/thermal/mediatek/lvts_thermal.c @@ -1458,7 +1458,6 @@ static const struct lvts_ctrl_data mt8188_lvts_mcu_data_ctrl[] = { }, VALID_SENSOR_MAP(1, 1, 1, 1), .offset = 0x0, - .mode = LVTS_MSR_FILTERED_MODE, }, { .lvts_sensor = { @@ -1469,7 +1468,6 @@ static const struct lvts_ctrl_data mt8188_lvts_mcu_data_ctrl[] = { }, VALID_SENSOR_MAP(1, 1, 0, 0), .offset = 0x100, - .mode = LVTS_MSR_FILTERED_MODE, } }; @@ -1483,7 +1481,6 @@ static const struct lvts_ctrl_data mt8188_lvts_ap_data_ctrl[] = { }, VALID_SENSOR_MAP(0, 1, 0, 0), .offset = 0x0, - .mode = LVTS_MSR_FILTERED_MODE, }, { .lvts_sensor = { @@ -1496,7 +1493,6 @@ static const struct lvts_ctrl_data mt8188_lvts_ap_data_ctrl[] = { }, VALID_SENSOR_MAP(1, 1, 1, 0), .offset = 0x100, - .mode = LVTS_MSR_FILTERED_MODE, }, { .lvts_sensor = { @@ -1507,7 +1503,6 @@ static const struct lvts_ctrl_data mt8188_lvts_ap_data_ctrl[] = { }, VALID_SENSOR_MAP(1, 1, 0, 0), .offset = 0x200, - .mode = LVTS_MSR_FILTERED_MODE, }, { .lvts_sensor = { @@ -1518,7 +1513,6 @@ static const struct lvts_ctrl_data mt8188_lvts_ap_data_ctrl[] = { }, VALID_SENSOR_MAP(1, 1, 0, 0), .offset = 0x300, - .mode = LVTS_MSR_FILTERED_MODE, } }; From 72cacd06e47d86d89b0e7179fbc9eb3a0f39cd93 Mon Sep 17 00:00:00 2001 From: Julien Panis Date: Tue, 4 Jun 2024 18:46:58 +0200 Subject: [PATCH 2/5] thermal/drivers/mediatek/lvts_thermal: Return error in case of invalid efuse data This patch prevents from registering thermal entries and letting the driver misbehave if efuse data is invalid. A device is not properly calibrated if the golden temperature is zero. Fixes: f5f633b18234 ("thermal/drivers/mediatek: Add the Low Voltage Thermal Sensor driver") Signed-off-by: Julien Panis Reviewed-by: Nicolas Pitre Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20240604-mtk-thermal-calib-check-v2-1-8f258254051d@baylibre.com Signed-off-by: Daniel Lezcano --- drivers/thermal/mediatek/lvts_thermal.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/mediatek/lvts_thermal.c b/drivers/thermal/mediatek/lvts_thermal.c index 82c355c466cf..819ed0110f3e 100644 --- a/drivers/thermal/mediatek/lvts_thermal.c +++ b/drivers/thermal/mediatek/lvts_thermal.c @@ -769,7 +769,11 @@ static int lvts_golden_temp_init(struct device *dev, u8 *calib, */ gt = (((u32 *)calib)[0] >> lvts_data->gt_calib_bit_offset) & 0xff; - if (gt && gt < LVTS_GOLDEN_TEMP_MAX) + /* A zero value for gt means that device has invalid efuse data */ + if (!gt) + return -ENODATA; + + if (gt < LVTS_GOLDEN_TEMP_MAX) golden_temp = gt; golden_temp_offset = golden_temp * 500 + lvts_data->temp_offset; From d2278f3533a8c4933c52f85784ffa73e8250c524 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 14 Jun 2024 17:22:25 +0200 Subject: [PATCH 3/5] thermal: core: Synchronize suspend-prepare and post-suspend actions After commit 5a5efdaffda5 ("thermal: core: Resume thermal zones asynchronously") it is theoretically possible that, if a system suspend starts immediately after a system resume, thermal_zone_device_resume() spawned by the thermal PM notifier for one of the thermal zones at the end of the system resume will run after the PM thermal notifier for the suspend-prepare action. If that happens, tz->suspended set by the latter will be reset by the former which may lead to unexpected consequences. To avoid that race, synchronize thermal_zone_device_resume() with the suspend-prepare thermal PM notifier with the help of additional bool field and completion in struct thermal_zone_device. Note that this also ensures running __thermal_zone_device_update() at least once for each thermal zone between system resume and the following system suspend in case it is needed to start thermal mitigation. Fixes: 5a5efdaffda5 ("thermal: core: Resume thermal zones asynchronously") Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_core.c | 21 +++++++++++++++++++++ drivers/thermal/thermal_core.h | 4 ++++ 2 files changed, 25 insertions(+) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 30567b499455..f92529fb0d10 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1397,6 +1397,7 @@ thermal_zone_device_register_with_trips(const char *type, ida_init(&tz->ida); mutex_init(&tz->lock); init_completion(&tz->removal); + init_completion(&tz->resume); id = ida_alloc(&thermal_tz_ida, GFP_KERNEL); if (id < 0) { result = id; @@ -1642,6 +1643,9 @@ static void thermal_zone_device_resume(struct work_struct *work) thermal_zone_device_init(tz); __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); + complete(&tz->resume); + tz->resuming = false; + mutex_unlock(&tz->lock); } @@ -1659,6 +1663,20 @@ static int thermal_pm_notify(struct notifier_block *nb, list_for_each_entry(tz, &thermal_tz_list, node) { mutex_lock(&tz->lock); + if (tz->resuming) { + /* + * thermal_zone_device_resume() queued up for + * this zone has not acquired the lock yet, so + * release it to let the function run and wait + * util it has done the work. + */ + mutex_unlock(&tz->lock); + + wait_for_completion(&tz->resume); + + mutex_lock(&tz->lock); + } + tz->suspended = true; mutex_unlock(&tz->lock); @@ -1676,6 +1694,9 @@ static int thermal_pm_notify(struct notifier_block *nb, cancel_delayed_work(&tz->poll_queue); + reinit_completion(&tz->resume); + tz->resuming = true; + /* * Replace the work function with the resume one, which * will restore the original work function and schedule diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index 20e7b45673d6..66f67e54e0c8 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -55,6 +55,7 @@ struct thermal_governor { * @type: the thermal zone device type * @device: &struct device for this thermal zone * @removal: removal completion + * @resume: resume completion * @trip_temp_attrs: attributes for trip points for sysfs: trip temperature * @trip_type_attrs: attributes for trip points for sysfs: trip type * @trip_hyst_attrs: attributes for trip points for sysfs: trip hysteresis @@ -89,6 +90,7 @@ struct thermal_governor { * @poll_queue: delayed work for polling * @notify_event: Last notification event * @suspended: thermal zone suspend indicator + * @resuming: indicates whether or not thermal zone resume is in progress * @trips: array of struct thermal_trip objects */ struct thermal_zone_device { @@ -96,6 +98,7 @@ struct thermal_zone_device { char type[THERMAL_NAME_LENGTH]; struct device device; struct completion removal; + struct completion resume; struct attribute_group trips_attribute_group; struct thermal_attr *trip_temp_attrs; struct thermal_attr *trip_type_attrs; @@ -123,6 +126,7 @@ struct thermal_zone_device { struct delayed_work poll_queue; enum thermal_notify_event notify_event; bool suspended; + bool resuming; #ifdef CONFIG_THERMAL_DEBUGFS struct thermal_debugfs *debugfs; #endif From 494c7d055081da066424706b28faa9a4c719d852 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 14 Jun 2024 17:26:00 +0200 Subject: [PATCH 4/5] thermal: core: Change PM notifier priority to the minimum It is reported that commit 5a5efdaffda5 ("thermal: core: Resume thermal zones asynchronously") causes battery data in sysfs on Thinkpad P1 Gen2 to become invalid after a resume from S3 (and it is necessary to reboot the machine to restore correct battery data). Some investigation into the problem indicated that it happened because, after the commit in question, the ACPI battery PM notifier ran in parallel with thermal_zone_device_resume() for one of the thermal zones which apparently confused the platform firmware on the affected system. While the exact reason for the firmware confusion remains unclear, it is arguably not particularly relevant, and the expected behavior of the affected system can be restored by making the thermal PM notifier run at the lowest priority which avoids interference between work items spawned by it and the other PM notifiers (that will run before those work items now). Fixes: 5a5efdaffda5 ("thermal: core: Resume thermal zones asynchronously") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218881 Reported-by: fhortner@yahoo.de Tested-by: fhortner@yahoo.de Cc: 6.8+ # 6.8+ Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index f92529fb0d10..f7c38c0d6199 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1721,6 +1721,12 @@ static int thermal_pm_notify(struct notifier_block *nb, static struct notifier_block thermal_pm_nb = { .notifier_call = thermal_pm_notify, + /* + * Run at the lowest priority to avoid interference between the thermal + * zone resume work items spawned by thermal_pm_notify() and the other + * PM notifiers. + */ + .priority = INT_MIN, }; static int __init thermal_init(void) From 096597cfe4ea08b1830e775436d76d7c9d6d3037 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 18 Jun 2024 21:44:24 -0700 Subject: [PATCH 5/5] thermal: int340x: processor_thermal: Support shared interrupts On some systems the processor thermal device interrupt is shared with other PCI devices. In this case return IRQ_NONE from the interrupt handler when the interrupt is not for the processor thermal device. Signed-off-by: Srinivas Pandruvada Fixes: f0658708e863 ("thermal: int340x: processor_thermal: Use non MSI interrupts by default") Cc: 6.7+ # 6.7+ Signed-off-by: Rafael J. Wysocki --- .../intel/int340x_thermal/processor_thermal_device_pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c index 14e34eabc419..4a1bfebb1b8e 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c @@ -150,7 +150,7 @@ static irqreturn_t proc_thermal_irq_handler(int irq, void *devid) { struct proc_thermal_pci *pci_info = devid; struct proc_thermal_device *proc_priv; - int ret = IRQ_HANDLED; + int ret = IRQ_NONE; u32 status; proc_priv = pci_info->proc_priv; @@ -175,6 +175,7 @@ static irqreturn_t proc_thermal_irq_handler(int irq, void *devid) /* Disable enable interrupt flag */ proc_thermal_mmio_write(pci_info, PROC_THERMAL_MMIO_INT_ENABLE_0, 0); pkg_thermal_schedule_work(&pci_info->work); + ret = IRQ_HANDLED; } pci_write_config_byte(pci_info->pdev, 0xdc, 0x01);