diff options
author | Kan Liang <kan.liang@linux.intel.com> | 2023-01-25 12:49:25 -0800 |
---|---|---|
committer | Peter Zijlstra <peterz@infradead.org> | 2023-02-11 11:18:12 +0100 |
commit | 89e97eb8cec0f1af5ebf2380308913256ca7915a (patch) | |
tree | 9eebd90207fce9623af3d3cf6dee623578dc4e26 /arch | |
parent | 5d515ee40cb57ea5331998f27df7946a69f14dc3 (diff) |
perf/x86/intel/ds: Fix the conversion from TSC to perf time
The time order is incorrect when the TSC in a PEBS record is used.
$perf record -e cycles:upp dd if=/dev/zero of=/dev/null
count=10000
$ perf script --show-task-events
perf-exec 0 0.000000: PERF_RECORD_COMM: perf-exec:915/915
dd 915 106.479872: PERF_RECORD_COMM exec: dd:915/915
dd 915 106.483270: PERF_RECORD_EXIT(915:915):(914:914)
dd 915 106.512429: 1 cycles:upp:
ffffffff96c011b7 [unknown] ([unknown])
... ...
The perf time is from sched_clock_cpu(). The current PEBS code
unconditionally convert the TSC to native_sched_clock(). There is a
shift between the two clocks. If the TSC is stable, the shift is
consistent, __sched_clock_offset. If the TSC is unstable, the shift has
to be calculated at runtime.
This patch doesn't support the conversion when the TSC is unstable. The
TSC unstable case is a corner case and very unlikely to happen. If it
happens, the TSC in a PEBS record will be dropped and fall back to
perf_event_clock().
Fixes: 47a3aeb39e8d ("perf/x86/intel/pebs: Fix PEBS timestamps overwritten")
Reported-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/CAM9d7cgWDVAq8-11RbJ2uGfwkKD6fA-OMwOKDrNUrU_=8MgEjg@mail.gmail.com/
Diffstat (limited to 'arch')
-rw-r--r-- | arch/x86/events/intel/ds.c | 35 |
1 files changed, 26 insertions, 9 deletions
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 183efa914b99..b0354dc869d2 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2,12 +2,14 @@ #include <linux/bitops.h> #include <linux/types.h> #include <linux/slab.h> +#include <linux/sched/clock.h> #include <asm/cpu_entry_area.h> #include <asm/perf_event.h> #include <asm/tlbflush.h> #include <asm/insn.h> #include <asm/io.h> +#include <asm/timer.h> #include "../perf_event.h" @@ -1568,6 +1570,27 @@ static u64 get_data_src(struct perf_event *event, u64 aux) return val; } +static void setup_pebs_time(struct perf_event *event, + struct perf_sample_data *data, + u64 tsc) +{ + /* Converting to a user-defined clock is not supported yet. */ + if (event->attr.use_clockid != 0) + return; + + /* + * Doesn't support the conversion when the TSC is unstable. + * The TSC unstable case is a corner case and very unlikely to + * happen. If it happens, the TSC in a PEBS record will be + * dropped and fall back to perf_event_clock(). + */ + if (!using_native_sched_clock() || !sched_clock_stable()) + return; + + data->time = native_sched_clock_from_tsc(tsc) + __sched_clock_offset; + data->sample_flags |= PERF_SAMPLE_TIME; +} + #define PERF_SAMPLE_ADDR_TYPE (PERF_SAMPLE_ADDR | \ PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_DATA_PAGE_SIZE) @@ -1715,11 +1738,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, * * We can only do this for the default trace clock. */ - if (x86_pmu.intel_cap.pebs_format >= 3 && - event->attr.use_clockid == 0) { - data->time = native_sched_clock_from_tsc(pebs->tsc); - data->sample_flags |= PERF_SAMPLE_TIME; - } + if (x86_pmu.intel_cap.pebs_format >= 3) + setup_pebs_time(event, data, pebs->tsc); if (has_branch_stack(event)) perf_sample_save_brstack(data, event, &cpuc->lbr_stack); @@ -1781,10 +1801,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, perf_sample_data_init(data, 0, event->hw.last_period); data->period = event->hw.last_period; - if (event->attr.use_clockid == 0) { - data->time = native_sched_clock_from_tsc(basic->tsc); - data->sample_flags |= PERF_SAMPLE_TIME; - } + setup_pebs_time(event, data, basic->tsc); /* * We must however always use iregs for the unwinder to stay sane; the |