diff options
author | K Prateek Nayak <kprateek.nayak@amd.com> | 2023-05-17 22:57:44 +0530 |
---|---|---|
committer | Arnaldo Carvalho de Melo <acme@redhat.com> | 2023-05-23 16:10:13 -0300 |
commit | aab667ca8837e45fda0204bed7b59abd634c0b2b (patch) | |
tree | bb014c00c3dd53469d10845df946cb9e1aae5442 /tools/perf/builtin-stat.c | |
parent | 4b87406a3b590888edf02705a815eb62e122e9ba (diff) |
perf stat: Add "--per-cache" aggregation option and document it
This patch adds support for "--per-cache" option for aggregation at a
particular cache level and documents the same.
Following is the output of 'perf stat' with aggregation at L3 for the
event "ls_dmnd_fills_from_sys.ext_cache_remote" on a dual socket 3rd
Generation EPYC Processor (2 x 64C/128T - 16 LLCs) when running
hackbench pinned to 4 LLCs:
$ sudo perf stat --per-cache=L3 -a -e ls_dmnd_fills_from_sys.ext_cache_remote -- \
taskset -c 0-15,64-79,128-143,192-207 \
perf bench sched messaging -p -t -l 100000 -g 8
...
Performance counter stats for 'system wide':
S0-D0-L3-ID0 16 9,500,803 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L3-ID8 16 6,338,099 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L3-ID16 16 355,005 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L3-ID24 16 22,067 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L3-ID32 16 16,321 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L3-ID40 16 11,619 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L3-ID48 16 4,238 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L3-ID56 16 31,158 ls_dmnd_fills_from_sys.ext_cache_remote
S1-D1-L3-ID64 16 28,242,452 ls_dmnd_fills_from_sys.ext_cache_remote
S1-D1-L3-ID72 16 22,906,973 ls_dmnd_fills_from_sys.ext_cache_remote
S1-D1-L3-ID80 16 72,898 ls_dmnd_fills_from_sys.ext_cache_remote
S1-D1-L3-ID88 16 56,907 ls_dmnd_fills_from_sys.ext_cache_remote
S1-D1-L3-ID96 16 20,456 ls_dmnd_fills_from_sys.ext_cache_remote
S1-D1-L3-ID104 16 40,913 ls_dmnd_fills_from_sys.ext_cache_remote
S1-D1-L3-ID112 16 78,113 ls_dmnd_fills_from_sys.ext_cache_remote
S1-D1-L3-ID120 16 37,897 ls_dmnd_fills_from_sys.ext_cache_remote
Also support 'perf stat record' and 'perf stat report' with the ability
to specify a different cache level to aggregate data at when running
'perf stat report'.
$ sudo perf stat record --per-cache=L2 -a -e ls_dmnd_fills_from_sys.ext_cache_remote -- \
taskset -c 0-15,64-79,128-143,192-207 \
perf bench sched messaging -p -t -l 100000 -g 8
...
Performance counter stats for 'system wide':
S0-D0-L2-ID0 2 1,442,061 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L2-ID1 2 1,548,994 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L2-ID2 2 1,553,557 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L2-ID3 2 1,420,122 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L2-ID4 2 1,465,461 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L2-ID5 2 1,455,153 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L2-ID6 2 1,595,237 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L2-ID7 2 1,499,321 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L2-ID8 2 1,919,025 ls_dmnd_fills_from_sys.ext_cache_remote
...
S1-D1-L2-ID127 2 21,295 ls_dmnd_fills_from_sys.ext_cache_remote
$ sudo perf stat report --per-cache=L3
Performance counter stats for 'perf stat record --per-cache=L2 -a -e ls_dmnd_fills_from_sys.ext_cache_remote --\
taskset -c 0-15,64-79,128-143,192-207 \
perf bench sched messaging -p -t -l 100000 -g 8':
S0-D0-L3-ID0 16 11,979,906 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L3-ID8 16 14,257,202 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L3-ID16 16 377,484 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L3-ID24 16 27,224 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L3-ID32 16 26,816 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L3-ID40 16 14,461 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L3-ID48 16 10,499 ls_dmnd_fills_from_sys.ext_cache_remote
S0-D0-L3-ID56 16 53,817 ls_dmnd_fills_from_sys.ext_cache_remote
S1-D1-L3-ID64 16 27,361,987 ls_dmnd_fills_from_sys.ext_cache_remote
S1-D1-L3-ID72 16 37,299,024 ls_dmnd_fills_from_sys.ext_cache_remote
S1-D1-L3-ID80 16 84,125 ls_dmnd_fills_from_sys.ext_cache_remote
S1-D1-L3-ID88 16 64,561 ls_dmnd_fills_from_sys.ext_cache_remote
S1-D1-L3-ID96 16 13,403 ls_dmnd_fills_from_sys.ext_cache_remote
S1-D1-L3-ID104 16 20,138 ls_dmnd_fills_from_sys.ext_cache_remote
S1-D1-L3-ID112 16 93,220 ls_dmnd_fills_from_sys.ext_cache_remote
S1-D1-L3-ID120 16 35,465 ls_dmnd_fills_from_sys.ext_cache_remote
On the above system, the domain covered by S0-D0-L3-ID0 contains
S0-D0-L2-ID0 to S0-D0-L2-ID7, the corresponding count for L3-ID0 is
equal to the sum of counts for L2-ID0 to L2-ID7.
Add documentation for the newly introduced "--per-cache" option.
Suggested-by: Gautham Shenoy <gautham.shenoy@amd.com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ananth Narayan <ananth.narayan@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Wen Pu <puwen@hygon.cn>
Link: https://lore.kernel.org/r/20230517172745.5833-5-kprateek.nayak@amd.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Diffstat (limited to 'tools/perf/builtin-stat.c')
-rw-r--r-- | tools/perf/builtin-stat.c | 56 |
1 files changed, 56 insertions, 0 deletions
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 0528d1bc15d2..176deeb8ee66 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1113,6 +1113,55 @@ static int parse_cputype(const struct option *opt, return 0; } +static int parse_cache_level(const struct option *opt, + const char *str, + int unset __maybe_unused) +{ + int level; + u32 *aggr_mode = (u32 *)opt->value; + u32 *aggr_level = (u32 *)opt->data; + + /* + * If no string is specified, aggregate based on the topology of + * Last Level Cache (LLC). Since the LLC level can change from + * architecture to architecture, set level greater than + * MAX_CACHE_LVL which will be interpreted as LLC. + */ + if (str == NULL) { + level = MAX_CACHE_LVL + 1; + goto out; + } + + /* + * The format to specify cache level is LX or lX where X is the + * cache level. + */ + if (strlen(str) != 2 || (str[0] != 'l' && str[0] != 'L')) { + pr_err("Cache level must be of form L[1-%d], or l[1-%d]\n", + MAX_CACHE_LVL, + MAX_CACHE_LVL); + return -EINVAL; + } + + level = atoi(&str[1]); + if (level < 1) { + pr_err("Cache level must be of form L[1-%d], or l[1-%d]\n", + MAX_CACHE_LVL, + MAX_CACHE_LVL); + return -EINVAL; + } + + if (level > MAX_CACHE_LVL) { + pr_err("perf only supports max cache level of %d.\n" + "Consider increasing MAX_CACHE_LVL\n", MAX_CACHE_LVL); + return -EINVAL; + } +out: + *aggr_mode = AGGR_CACHE; + *aggr_level = level; + return 0; +} + static struct option stat_options[] = { OPT_BOOLEAN('T', "transaction", &transaction_run, "hardware transaction statistics"), @@ -1190,6 +1239,9 @@ static struct option stat_options[] = { "aggregate counts per processor socket", AGGR_SOCKET), OPT_SET_UINT(0, "per-die", &stat_config.aggr_mode, "aggregate counts per processor die", AGGR_DIE), + OPT_CALLBACK_OPTARG(0, "per-cache", &stat_config.aggr_mode, &stat_config.aggr_level, + "cache level", "aggregate count at this cache level (Default: LLC)", + parse_cache_level), OPT_SET_UINT(0, "per-core", &stat_config.aggr_mode, "aggregate counts per physical processor core", AGGR_CORE), OPT_SET_UINT(0, "per-thread", &stat_config.aggr_mode, @@ -2346,6 +2398,10 @@ static int __cmd_report(int argc, const char **argv) "aggregate counts per processor socket", AGGR_SOCKET), OPT_SET_UINT(0, "per-die", &perf_stat.aggr_mode, "aggregate counts per processor die", AGGR_DIE), + OPT_CALLBACK_OPTARG(0, "per-cache", &perf_stat.aggr_mode, &perf_stat.aggr_level, + "cache level", + "aggregate count at this cache level (Default: LLC)", + parse_cache_level), OPT_SET_UINT(0, "per-core", &perf_stat.aggr_mode, "aggregate counts per physical processor core", AGGR_CORE), OPT_SET_UINT(0, "per-node", &perf_stat.aggr_mode, |