summaryrefslogtreecommitdiff
path: root/tools/perf/builtin-stat.c
diff options
context:
space:
mode:
authorK Prateek Nayak <kprateek.nayak@amd.com>2023-05-17 22:57:44 +0530
committerArnaldo Carvalho de Melo <acme@redhat.com>2023-05-23 16:10:13 -0300
commitaab667ca8837e45fda0204bed7b59abd634c0b2b (patch)
treebb014c00c3dd53469d10845df946cb9e1aae5442 /tools/perf/builtin-stat.c
parent4b87406a3b590888edf02705a815eb62e122e9ba (diff)
perf stat: Add "--per-cache" aggregation option and document it
This patch adds support for "--per-cache" option for aggregation at a particular cache level and documents the same. Following is the output of 'perf stat' with aggregation at L3 for the event "ls_dmnd_fills_from_sys.ext_cache_remote" on a dual socket 3rd Generation EPYC Processor (2 x 64C/128T - 16 LLCs) when running hackbench pinned to 4 LLCs: $ sudo perf stat --per-cache=L3 -a -e ls_dmnd_fills_from_sys.ext_cache_remote -- \ taskset -c 0-15,64-79,128-143,192-207 \ perf bench sched messaging -p -t -l 100000 -g 8 ... Performance counter stats for 'system wide': S0-D0-L3-ID0 16 9,500,803 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID8 16 6,338,099 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID16 16 355,005 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID24 16 22,067 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID32 16 16,321 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID40 16 11,619 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID48 16 4,238 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID56 16 31,158 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID64 16 28,242,452 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID72 16 22,906,973 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID80 16 72,898 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID88 16 56,907 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID96 16 20,456 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID104 16 40,913 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID112 16 78,113 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID120 16 37,897 ls_dmnd_fills_from_sys.ext_cache_remote Also support 'perf stat record' and 'perf stat report' with the ability to specify a different cache level to aggregate data at when running 'perf stat report'. $ sudo perf stat record --per-cache=L2 -a -e ls_dmnd_fills_from_sys.ext_cache_remote -- \ taskset -c 0-15,64-79,128-143,192-207 \ perf bench sched messaging -p -t -l 100000 -g 8 ... Performance counter stats for 'system wide': S0-D0-L2-ID0 2 1,442,061 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID1 2 1,548,994 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID2 2 1,553,557 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID3 2 1,420,122 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID4 2 1,465,461 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID5 2 1,455,153 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID6 2 1,595,237 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID7 2 1,499,321 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID8 2 1,919,025 ls_dmnd_fills_from_sys.ext_cache_remote ... S1-D1-L2-ID127 2 21,295 ls_dmnd_fills_from_sys.ext_cache_remote $ sudo perf stat report --per-cache=L3 Performance counter stats for 'perf stat record --per-cache=L2 -a -e ls_dmnd_fills_from_sys.ext_cache_remote --\ taskset -c 0-15,64-79,128-143,192-207 \ perf bench sched messaging -p -t -l 100000 -g 8': S0-D0-L3-ID0 16 11,979,906 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID8 16 14,257,202 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID16 16 377,484 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID24 16 27,224 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID32 16 26,816 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID40 16 14,461 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID48 16 10,499 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID56 16 53,817 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID64 16 27,361,987 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID72 16 37,299,024 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID80 16 84,125 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID88 16 64,561 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID96 16 13,403 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID104 16 20,138 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID112 16 93,220 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID120 16 35,465 ls_dmnd_fills_from_sys.ext_cache_remote On the above system, the domain covered by S0-D0-L3-ID0 contains S0-D0-L2-ID0 to S0-D0-L2-ID7, the corresponding count for L3-ID0 is equal to the sum of counts for L2-ID0 to L2-ID7. Add documentation for the newly introduced "--per-cache" option. Suggested-by: Gautham Shenoy <gautham.shenoy@amd.com> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com> Acked-by: Ian Rogers <irogers@google.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ananth Narayan <ananth.narayan@amd.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ravi Bangoria <ravi.bangoria@amd.com> Cc: Sandipan Das <sandipan.das@amd.com> Cc: Stephane Eranian <eranian@google.com> Cc: Wen Pu <puwen@hygon.cn> Link: https://lore.kernel.org/r/20230517172745.5833-5-kprateek.nayak@amd.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Diffstat (limited to 'tools/perf/builtin-stat.c')
-rw-r--r--tools/perf/builtin-stat.c56
1 files changed, 56 insertions, 0 deletions
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 0528d1bc15d2..176deeb8ee66 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1113,6 +1113,55 @@ static int parse_cputype(const struct option *opt,
return 0;
}
+static int parse_cache_level(const struct option *opt,
+ const char *str,
+ int unset __maybe_unused)
+{
+ int level;
+ u32 *aggr_mode = (u32 *)opt->value;
+ u32 *aggr_level = (u32 *)opt->data;
+
+ /*
+ * If no string is specified, aggregate based on the topology of
+ * Last Level Cache (LLC). Since the LLC level can change from
+ * architecture to architecture, set level greater than
+ * MAX_CACHE_LVL which will be interpreted as LLC.
+ */
+ if (str == NULL) {
+ level = MAX_CACHE_LVL + 1;
+ goto out;
+ }
+
+ /*
+ * The format to specify cache level is LX or lX where X is the
+ * cache level.
+ */
+ if (strlen(str) != 2 || (str[0] != 'l' && str[0] != 'L')) {
+ pr_err("Cache level must be of form L[1-%d], or l[1-%d]\n",
+ MAX_CACHE_LVL,
+ MAX_CACHE_LVL);
+ return -EINVAL;
+ }
+
+ level = atoi(&str[1]);
+ if (level < 1) {
+ pr_err("Cache level must be of form L[1-%d], or l[1-%d]\n",
+ MAX_CACHE_LVL,
+ MAX_CACHE_LVL);
+ return -EINVAL;
+ }
+
+ if (level > MAX_CACHE_LVL) {
+ pr_err("perf only supports max cache level of %d.\n"
+ "Consider increasing MAX_CACHE_LVL\n", MAX_CACHE_LVL);
+ return -EINVAL;
+ }
+out:
+ *aggr_mode = AGGR_CACHE;
+ *aggr_level = level;
+ return 0;
+}
+
static struct option stat_options[] = {
OPT_BOOLEAN('T', "transaction", &transaction_run,
"hardware transaction statistics"),
@@ -1190,6 +1239,9 @@ static struct option stat_options[] = {
"aggregate counts per processor socket", AGGR_SOCKET),
OPT_SET_UINT(0, "per-die", &stat_config.aggr_mode,
"aggregate counts per processor die", AGGR_DIE),
+ OPT_CALLBACK_OPTARG(0, "per-cache", &stat_config.aggr_mode, &stat_config.aggr_level,
+ "cache level", "aggregate count at this cache level (Default: LLC)",
+ parse_cache_level),
OPT_SET_UINT(0, "per-core", &stat_config.aggr_mode,
"aggregate counts per physical processor core", AGGR_CORE),
OPT_SET_UINT(0, "per-thread", &stat_config.aggr_mode,
@@ -2346,6 +2398,10 @@ static int __cmd_report(int argc, const char **argv)
"aggregate counts per processor socket", AGGR_SOCKET),
OPT_SET_UINT(0, "per-die", &perf_stat.aggr_mode,
"aggregate counts per processor die", AGGR_DIE),
+ OPT_CALLBACK_OPTARG(0, "per-cache", &perf_stat.aggr_mode, &perf_stat.aggr_level,
+ "cache level",
+ "aggregate count at this cache level (Default: LLC)",
+ parse_cache_level),
OPT_SET_UINT(0, "per-core", &perf_stat.aggr_mode,
"aggregate counts per physical processor core", AGGR_CORE),
OPT_SET_UINT(0, "per-node", &perf_stat.aggr_mode,