summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2024-07-27 10:44:49 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2024-07-27 10:44:49 -0700
commit: 1fcaa5db40f960e58f47050337db54eb087fb62a (patch)
tree: 42b491512906f01d71795c4c78f45208c5feee79
parent: 7b0acd911ca05b2555d834cd93ffcfab1ade828c (diff)
parent: f7c1b0e4ae47e67c6f9af84568a5f4a80638ccd8 (diff)
Merge tag 'thermal-6.11-rc1-3' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
Pull thermal control fix from Rafael Wysocki:
 "Prevent the thermal core from flooding the kernel log with useless
  messages if thermal zone temperature can never be determined (or its
  sensor has failed permanently) and make it finally give up and disable
  defective thermal zones (Rafael Wysocki)"

* tag 'thermal-6.11-rc1-3' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm:
  thermal: core: Back off when polling thermal zones on errors
  thermal: trip: Split thermal_zone_device_set_mode()
-rw-r--r-- drivers/thermal/thermal_core.c | 89
-rw-r--r-- drivers/thermal/thermal_core.h | 10
2 files changed, 85 insertions, 14 deletions
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index f6e700e48aad..95c399f94744 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -272,6 +272,44 @@ static int __init thermal_register_governors(void)
return ret;
}
+static int __thermal_zone_device_set_mode(struct thermal_zone_device *tz,
+ enum thermal_device_mode mode)
+{
+ if (tz->ops.change_mode) {
+ int ret;
+
+ ret = tz->ops.change_mode(tz, mode);
+ if (ret)
+ return ret;
+ }
+
+ tz->mode = mode;
+
+ return 0;
+}
+
+static void thermal_zone_broken_disable(struct thermal_zone_device *tz)
+{
+ struct thermal_trip_desc *td;
+
+ dev_err(&tz->device, "Unable to get temperature, disabling!\n");
+ /*
+ * This function only runs for enabled thermal zones, so no need to
+ * check for the current mode.
+ */
+ __thermal_zone_device_set_mode(tz, THERMAL_DEVICE_DISABLED);
+ thermal_notify_tz_disable(tz);
+
+ for_each_trip_desc(tz, td) {
+ if (td->trip.type == THERMAL_TRIP_CRITICAL &&
+ td->trip.temperature > THERMAL_TEMP_INVALID) {
+ dev_crit(&tz->device,
+ "Disabled thermal zone with critical trip point\n");
+ return;
+ }
+ }
+}
+
/*
* Zone update section: main control loop applied to each zone while monitoring
* in polling mode. The monitoring is done using a workqueue.
@@ -292,6 +330,34 @@ static void thermal_zone_device_set_polling(struct thermal_zone_device *tz,
cancel_delayed_work(&tz->poll_queue);
}
+static void thermal_zone_recheck(struct thermal_zone_device *tz, int error)
+{
+ if (error == -EAGAIN) {
+ thermal_zone_device_set_polling(tz, THERMAL_RECHECK_DELAY);
+ return;
+ }
+
+ /*
+ * Print the message once to reduce log noise. It will be followed by
+ * another one if the temperature cannot be determined after multiple
+ * attempts.
+ */
+ if (tz->recheck_delay_jiffies == THERMAL_RECHECK_DELAY)
+ dev_info(&tz->device, "Temperature check failed (%d)\n", error);
+
+ thermal_zone_device_set_polling(tz, tz->recheck_delay_jiffies);
+
+ tz->recheck_delay_jiffies += max(tz->recheck_delay_jiffies >> 1, 1ULL);
+ if (tz->recheck_delay_jiffies > THERMAL_MAX_RECHECK_DELAY) {
+ thermal_zone_broken_disable(tz);
+ /*
+ * Restore the original recheck delay value to allow the thermal
+ * zone to try to recover when it is reenabled by user space.
+ */
+ tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY;
+ }
+}
+
static void monitor_thermal_zone(struct thermal_zone_device *tz)
{
if (tz->mode != THERMAL_DEVICE_ENABLED)
@@ -491,10 +557,7 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
ret = __thermal_zone_get_temp(tz, &temp);
if (ret) {
- if (ret != -EAGAIN)
- dev_info(&tz->device, "Temperature check failed (%d)\n", ret);
-
- thermal_zone_device_set_polling(tz, msecs_to_jiffies(THERMAL_RECHECK_DELAY_MS));
+ thermal_zone_recheck(tz, ret);
return;
} else if (temp <= THERMAL_TEMP_INVALID) {
/*
@@ -506,6 +569,8 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
goto monitor;
}
+ tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY;
+
tz->last_temperature = tz->temperature;
tz->temperature = temp;
@@ -540,7 +605,7 @@ monitor:
static int thermal_zone_device_set_mode(struct thermal_zone_device *tz,
enum thermal_device_mode mode)
{
- int ret = 0;
+ int ret;
mutex_lock(&tz->lock);
@@ -548,14 +613,15 @@ static int thermal_zone_device_set_mode(struct thermal_zone_device *tz,
if (mode == tz->mode) {
mutex_unlock(&tz->lock);
- return ret;
+ return 0;
}
- if (tz->ops.change_mode)
- ret = tz->ops.change_mode(tz, mode);
+ ret = __thermal_zone_device_set_mode(tz, mode);
+ if (ret) {
+ mutex_unlock(&tz->lock);
- if (!ret)
- tz->mode = mode;
+ return ret;
+ }
__thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
@@ -566,7 +632,7 @@ static int thermal_zone_device_set_mode(struct thermal_zone_device *tz,
else
thermal_notify_tz_disable(tz);
- return ret;
+ return 0;
}
int thermal_zone_device_enable(struct thermal_zone_device *tz)
@@ -1445,6 +1511,7 @@ thermal_zone_device_register_with_trips(const char *type,
thermal_set_delay_jiffies(&tz->passive_delay_jiffies, passive_delay);
thermal_set_delay_jiffies(&tz->polling_delay_jiffies, polling_delay);
+ tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY;
/* sys I/F */
/* Add nodes that are always present via .groups */
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index ba8e6fc807ca..4cf2b7230d04 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -67,6 +67,8 @@ struct thermal_governor {
* @polling_delay_jiffies: number of jiffies to wait between polls when
* checking whether trip points have been crossed (0 for
* interrupt driven systems)
+ * @recheck_delay_jiffies: delay after a failed attempt to determine the zone
+ * temperature before trying again
* @temperature: current temperature. This is only for core code,
* drivers should use thermal_zone_get_temp() to get the
* current temperature
@@ -108,6 +110,7 @@ struct thermal_zone_device {
int num_trips;
unsigned long passive_delay_jiffies;
unsigned long polling_delay_jiffies;
+ unsigned long recheck_delay_jiffies;
int temperature;
int last_temperature;
int emul_temperature;
@@ -137,10 +140,11 @@ struct thermal_zone_device {
#define THERMAL_TEMP_INIT INT_MIN
/*
- * Default delay after a failing thermal zone temperature check before
- * attempting to check it again.
+ * Default and maximum delay after a failed thermal zone temperature check
+ * before attempting to check it again (in jiffies).
*/
-#define THERMAL_RECHECK_DELAY_MS 250
+#define THERMAL_RECHECK_DELAY msecs_to_jiffies(250)
+#define THERMAL_MAX_RECHECK_DELAY (120 * HZ)
/* Default Thermal Governor */
#if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE)