From 23d2626b841c2adccdeb477665313c02dff02dc3 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 14 Sep 2023 19:36:04 +0530 Subject: perf/x86/amd/core: Fix overflow reset on hotplug Kernels older than v5.19 do not support PerfMonV2 and the PMI handler does not clear the overflow bits of the PerfCntrGlobalStatus register. Because of this, loading a recent kernel using kexec from an older kernel can result in inconsistent register states on Zen 4 systems. The PMI handler of the new kernel gets confused and shows a warning when an overflow occurs because some of the overflow bits are set even if the corresponding counters are inactive. These are remnants from overflows that were handled by the older kernel. During CPU hotplug, the PerfCntrGlobalCtl and PerfCntrGlobalStatus registers should always be cleared for PerfMonV2-capable processors. However, a condition used for NB event constaints applicable only to older processors currently prevents this from happening. Move the reset sequence to an appropriate place and also clear the LBR Freeze bit. Fixes: 21d59e3e2c40 ("perf/x86/amd/core: Detect PerfMonV2 support") Signed-off-by: Sandipan Das Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/882a87511af40792ba69bb0e9026f19a2e71e8a3.1694696888.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index abadd5f23425..ed626bfa1eed 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -534,8 +534,12 @@ static void amd_pmu_cpu_reset(int cpu) /* Clear enable bits i.e. PerfCntrGlobalCtl.PerfCntrEn */ wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); - /* Clear overflow bits i.e. PerfCntrGLobalStatus.PerfCntrOvfl */ - wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, amd_pmu_global_cntr_mask); + /* + * Clear freeze and overflow bits i.e. PerfCntrGLobalStatus.LbrFreeze + * and PerfCntrGLobalStatus.PerfCntrOvfl + */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, + GLOBAL_STATUS_LBRS_FROZEN | amd_pmu_global_cntr_mask); } static int amd_pmu_cpu_prepare(int cpu) @@ -570,6 +574,7 @@ static void amd_pmu_cpu_starting(int cpu) int i, nb_id; cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; + amd_pmu_cpu_reset(cpu); if (!x86_pmu.amd_nb_constraints) return; @@ -591,8 +596,6 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; - - amd_pmu_cpu_reset(cpu); } static void amd_pmu_cpu_dead(int cpu) @@ -601,6 +604,7 @@ static void amd_pmu_cpu_dead(int cpu) kfree(cpuhw->lbr_sel); cpuhw->lbr_sel = NULL; + amd_pmu_cpu_reset(cpu); if (!x86_pmu.amd_nb_constraints) return; @@ -613,8 +617,6 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw->amd_nb = NULL; } - - amd_pmu_cpu_reset(cpu); } static inline void amd_pmu_set_global_ctl(u64 ctl) -- cgit v1.2.3 From 599522d9d2e19d6240e4312577f1c5f3ffca22f6 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 14 Sep 2023 19:58:40 +0530 Subject: perf/x86/amd: Do not WARN() on every IRQ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zen 4 systems running buggy microcode can hit a WARN_ON() in the PMI handler, as shown below, several times while perf runs. A simple `perf top` run is enough to render the system unusable: WARNING: CPU: 18 PID: 20608 at arch/x86/events/amd/core.c:944 amd_pmu_v2_handle_irq+0x1be/0x2b0 This happens because the Performance Counter Global Status Register (PerfCntGlobalStatus) has one or more bits set which are considered reserved according to the "AMD64 Architecture Programmer’s Manual, Volume 2: System Programming, 24593": https://www.amd.com/system/files/TechDocs/24593.pdf To make this less intrusive, warn just once if any reserved bit is set and prompt the user to update the microcode. Also sanitize the value to what the code is handling, so that the overflow events continue to be handled for the number of counters that are known to be sane. Going forward, the following microcode patch levels are recommended for Zen 4 processors in order to avoid such issues with reserved bits: Family=0x19 Model=0x11 Stepping=0x01: Patch=0x0a10113e Family=0x19 Model=0x11 Stepping=0x02: Patch=0x0a10123e Family=0x19 Model=0xa0 Stepping=0x01: Patch=0x0aa00116 Family=0x19 Model=0xa0 Stepping=0x02: Patch=0x0aa00212 Commit f2eb058afc57 ("linux-firmware: Update AMD cpu microcode") from the linux-firmware tree has binaries that meet the minimum required patch levels. [ sandipan: - add message to prompt users to update microcode - rework commit message and call out required microcode levels ] Fixes: 7685665c390d ("perf/x86/amd/core: Add PerfMonV2 overflow handling") Reported-by: Jirka Hladky Signed-off-by: Breno Leitao Signed-off-by: Sandipan Das Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/all/3540f985652f41041e54ee82aa53e7dbd55739ae.1694696888.git.sandipan.das@amd.com/ --- arch/x86/events/amd/core.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index ed626bfa1eed..e24976593a29 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -886,7 +886,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) struct hw_perf_event *hwc; struct perf_event *event; int handled = 0, idx; - u64 status, mask; + u64 reserved, status, mask; bool pmu_enabled; /* @@ -911,6 +911,14 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) status &= ~GLOBAL_STATUS_LBRS_FROZEN; } + reserved = status & ~amd_pmu_global_cntr_mask; + if (reserved) + pr_warn_once("Reserved PerfCntrGlobalStatus bits are set (0x%llx), please consider updating microcode\n", + reserved); + + /* Clear any reserved bits set by buggy microcode */ + status &= amd_pmu_global_cntr_mask; + for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) continue; -- cgit v1.2.3