diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-10-30 17:56:53 -1000 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-10-30 17:56:53 -1000 |
commit | 9a0f53e0cfc2ef262c05b8e4ab89e7f2accaf96c (patch) | |
tree | d5d05ad14bb222c7d20f96e5738eb8ff95a797a3 | |
parent | 6750f0de53b7d64a4deffa62944ea431a153ec48 (diff) | |
parent | 94b3f0b5af2c7af69e3d6e0cdd9b0ea535f22186 (diff) |
Merge tag 'csd-lock.2023.10.23a' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu
Pull CSD lock update from Paul McKenney:
"This adds a kernel boot parameter that causes the kernel to panic if
one of the call_smp_function() APIs is stalled for more than the
specified duration.
This is useful in deployments in which a clean panic is preferable to
an indefinite stall"
* tag 'csd-lock.2023.10.23a' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu:
smp,csd: Throw an error if a CSD lock is stuck for too long
-rw-r--r-- | Documentation/admin-guide/kernel-parameters.txt | 7 | ||||
-rw-r--r-- | kernel/smp.c | 13 |
2 files changed, 19 insertions, 1 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 45e34be4ed56..582f7a395153 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5864,6 +5864,13 @@ This feature may be more efficiently disabled using the csdlock_debug- kernel parameter. + smp.panic_on_ipistall= [KNL] + If a csd_lock_timeout extends for more than + the specified number of milliseconds, panic the + system. By default, let CSD-lock acquisition + take as long as they take. Specifying 300,000 + for this value provides a 5-minute timeout. + smsc-ircc2.nopnp [HW] Don't use PNP to discover SMC devices smsc-ircc2.ircc_cfg= [HW] Device configuration I/O port smsc-ircc2.ircc_sir= [HW] SIR base I/O port diff --git a/kernel/smp.c b/kernel/smp.c index 8c714583786b..f085ebcdf9e7 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -170,6 +170,8 @@ static DEFINE_PER_CPU(void *, cur_csd_info); static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */ module_param(csd_lock_timeout, ulong, 0444); +static int panic_on_ipistall; /* CSD panic timeout in milliseconds, 300000 for five minutes. */ +module_param(panic_on_ipistall, int, 0444); static atomic_t csd_bug_count = ATOMIC_INIT(0); @@ -230,6 +232,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in } ts2 = sched_clock(); + /* How long since we last checked for a stuck CSD lock.*/ ts_delta = ts2 - *ts1; if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0)) return false; @@ -243,9 +246,17 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in else cpux = cpu; cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */ + /* How long since this CSD lock was stuck. */ + ts_delta = ts2 - ts0; pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n", - firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0, + firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta, cpu, csd->func, csd->info); + /* + * If the CSD lock is still stuck after 5 minutes, it is unlikely + * to become unstuck. Use a signed comparison to avoid triggering + * on underflows when the TSC is out of sync between sockets. + */ + BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC)); if (cpu_cur_csd && csd != cpu_cur_csd) { pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n", *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)), |