305 files changed, 11094 insertions, 6539 deletions
diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore
index fd7a6ff9e7aa..05806ecfc33c 100644
--- a/tools/perf/.gitignore
+++ b/tools/perf/.gitignore
@@ -39,7 +39,12 @@ trace/beauty/generated/
 pmu-events/pmu-events.c
 pmu-events/jevents
 feature/
+libapi/
 libbpf/
+libperf/
+libsubcmd/
+libsymbol/
+libtraceevent/
+libtraceevent_plugins/
 fixdep
-libtraceevent-dynamic-list
 Documentation/doc.dep
diff --git a/tools/perf/Build b/tools/perf/Build
index 496b096153bb..6dd67e502295 100644
--- a/tools/perf/Build
+++ b/tools/perf/Build
@@ -5,7 +5,6 @@ perf-y += builtin-diff.o
 perf-y += builtin-evlist.o
 perf-y += builtin-ftrace.o
 perf-y += builtin-help.o
-perf-y += builtin-sched.o
 perf-y += builtin-buildid-list.o
 perf-y += builtin-buildid-cache.o
 perf-y += builtin-kallsyms.o
@@ -13,11 +12,8 @@ perf-y += builtin-list.o
 perf-y += builtin-record.o
 perf-y += builtin-report.o
 perf-y += builtin-stat.o
-perf-y += builtin-timechart.o
 perf-y += builtin-top.o
 perf-y += builtin-script.o
-perf-y += builtin-kmem.o
-perf-y += builtin-lock.o
 perf-y += builtin-kvm.o
 perf-y += builtin-inject.o
 perf-y += builtin-mem.o
@@ -25,9 +21,18 @@ perf-y += builtin-data.o
 perf-y += builtin-version.o
 perf-y += builtin-c2c.o
 perf-y += builtin-daemon.o
-perf-y += builtin-kwork.o
 
-perf-$(CONFIG_TRACE) += builtin-trace.o
+perf-$(CONFIG_LIBTRACEEVENT) += builtin-kmem.o
+perf-$(CONFIG_LIBTRACEEVENT) += builtin-kwork.o
+perf-$(CONFIG_LIBTRACEEVENT) += builtin-lock.o
+perf-$(CONFIG_LIBTRACEEVENT) += builtin-sched.o
+perf-$(CONFIG_LIBTRACEEVENT) += builtin-timechart.o
+
+ifeq ($(CONFIG_LIBTRACEEVENT),y)
+  perf-$(CONFIG_TRACE) += builtin-trace.o
+  perf-$(CONFIG_TRACE) += trace/beauty/
+endif
+
 perf-$(CONFIG_LIBELF) += builtin-probe.o
 
 perf-y += bench/
@@ -51,7 +56,6 @@ CFLAGS_builtin-report.o	   += -DDOCDIR="BUILD_STR($(srcdir_SQ)/Documentation)"
 perf-y += util/
 perf-y += arch/
 perf-y += ui/
-perf-y += scripts/
-perf-$(CONFIG_TRACE) += trace/beauty/
+perf-$(CONFIG_LIBTRACEEVENT) += scripts/
 
 gtk-y += ui/gtk/
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
index 18fcc52809fb..980fe2c29275 100644
--- a/tools/perf/Documentation/perf-annotate.txt
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -41,7 +41,7 @@ OPTIONS
 
 -q::
 --quiet::
-	Do not show any message.  (Suppress -v)
+	Do not show any warnings or messages.  (Suppress -v)
 
 -n::
 --show-nr-samples::
diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt
index be65bd55ab2a..f3067a4af294 100644
--- a/tools/perf/Documentation/perf-diff.txt
+++ b/tools/perf/Documentation/perf-diff.txt
@@ -75,7 +75,7 @@ OPTIONS
 
 -q::
 --quiet::
-	Do not show any message.  (Suppress -v)
+	Do not show any warnings or messages.  (Suppress -v)
 
 -f::
 --force::
diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt
index 92464a5d7eaf..7b6ccd2fa3bf 100644
--- a/tools/perf/Documentation/perf-intel-pt.txt
+++ b/tools/perf/Documentation/perf-intel-pt.txt
@@ -189,8 +189,16 @@ There is also script intel-pt-events.py which provides an example of how to
 unpack the raw data for power events and PTWRITE. The script also displays
 branches, and supports 2 additional modes selected by option:
 
- --insn-trace - instruction trace
- --src-trace - source trace
+ - --insn-trace - instruction trace
+ - --src-trace - source trace
+
+The intel-pt-events.py script also has options:
+
+ - --all-switch-events - display all switch events, not only the last consecutive.
+ - --interleave [<n>] - interleave sample output for the same timestamp so that
+ no more than n samples for a CPU are displayed in a row. 'n' defaults to 4.
+ Note this only affects the order of output, and only when the timestamp is the
+ same.
 
 As mentioned above, it is easy to capture too much data.  One way to limit the
 data captured is to use 'snapshot' mode which is explained further below.
diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
index 57384a97c04f..c5a3cb0f57c7 100644
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -39,9 +39,13 @@ any extra expressions computed by perf stat.
 --deprecated::
 Print deprecated events. By default the deprecated events are hidden.
 
---cputype::
-Print events applying cpu with this type for hybrid platform
-(e.g. --cputype core or --cputype atom)
+--unit::
+Print PMU events and metrics limited to the specific PMU name.
+(e.g. --unit cpu, --unit msr, --unit cpu_core, --unit cpu_atom)
+
+-j::
+--json::
+Output in JSON format.
 
 [[EVENT_MODIFIERS]]
 EVENT MODIFIERS
diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt
index 3b1e16563b79..38e79d45e426 100644
--- a/tools/perf/Documentation/perf-lock.txt
+++ b/tools/perf/Documentation/perf-lock.txt
@@ -42,7 +42,7 @@ COMMON OPTIONS
 
 -q::
 --quiet::
-	Do not show any message. (Suppress -v)
+	Do not show any warnings or messages. (Suppress -v)
 
 -D::
 --dump-raw-trace::
@@ -168,6 +168,10 @@ CONTENTION OPTIONS
 --entries=<value>::
 	Display this many entries.
 
+-l::
+--lock-addr::
+	Show lock contention stat by address
+
 
 SEE ALSO
 --------
diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index 080981d38d7b..7f8e8ba3a787 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -57,7 +57,7 @@ OPTIONS
 
 -q::
 --quiet::
-	Be quiet (do not show any messages including errors).
+	Do not show any warnings or messages.
 	Can not use with -v.
 
 -a::
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index e41ae950fdc3..ff815c2f67e8 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -238,10 +238,6 @@ OPTIONS
 	Also, by adding a comma, the number of mmap pages for AUX
 	area tracing can be specified.
 
---group::
-	Put all events in a single event group.  This precedes the --event
-	option and remains only for backward compatibility.  See --event.
-
 -g::
 	Enables call-graph (stack chain/backtrace) recording for both
 	kernel space and user space.
@@ -282,7 +278,7 @@ OPTIONS
 
 -q::
 --quiet::
-	Don't print any message, useful for scripting.
+	Don't print any warnings or messages, useful for scripting.
 
 -v::
 --verbose::
@@ -388,6 +384,7 @@ following filters are defined:
         - any_call: any function call or system call
         - any_ret: any function return or system call return
         - ind_call: any indirect branch
+        - ind_jmp: any indirect jump
         - call: direct calls, including far (to/from kernel) calls
         - u:  only when the branch target is at the user level
         - k: only when the branch target is in the kernel
@@ -396,6 +393,10 @@ following filters are defined:
 	- no_tx: only when the target is not in a hardware transaction
 	- abort_tx: only when the target is a hardware transaction abort
 	- cond: conditional branches
+	- call_stack: save call stack
+	- no_flags: don't save branch flags e.g prediction, misprediction etc
+	- no_cycles: don't save branch cycles
+	- hw_index: save branch hardware index
 	- save_type: save branch type during sampling in case binary is not available later
 		     For the platforms with Intel Arch LBR support (12th-Gen+ client or
 		     4th-Gen Xeon+ server), the save branch type is unconditionally enabled
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 4533db2ee56b..4fa509b15948 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -27,7 +27,7 @@ OPTIONS
 
 -q::
 --quiet::
-	Do not show any message.  (Suppress -v)
+	Do not show any warnings or messages.  (Suppress -v)
 
 -n::
 --show-nr-samples::
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index d7ff1867feda..18abdc1dce05 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -354,8 +354,8 @@ forbids the event merging logic from sharing events between groups and
 may be used to increase accuracy in this case.
 
 --quiet::
-Don't print output. This is useful with perf stat record below to only
-write data to the perf.data file.
+Don't print output, warnings or messages. This is useful with perf stat
+record below to only write data to the perf.data file.
 
 STAT RECORD
 -----------
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index c1fdba26bf53..e534d709cc5a 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -51,9 +51,6 @@ Default is to monitor all CPUS.
 --count-filter=<count>::
 	Only display functions with more events than this.
 
---group::
-        Put the counters into a counter group.
-
 --group-sort-idx::
 	Sort the output by the event at the index n in group. If n is invalid,
 	sort by the first event. It can support multiple groups with different
@@ -313,10 +310,10 @@ use '-e e1 -e e2 -G foo,foo' or just use '-e e1 -e e2 -G foo'.
 
 		perf top -e cycles,probe:icmp_rcv --switch-on=probe:icmp_rcv
 
-	   Alternatively one can ask for --group and then two overhead columns
+	   Alternatively one can ask for a group and then two overhead columns
            will appear, the first for cycles and the second for the switch-on event.
 
-		perf top --group -e cycles,probe:icmp_rcv --switch-on=probe:icmp_rcv
+		perf top -e '{cycles,probe:icmp_rcv}' --switch-on=probe:icmp_rcv
 
 	This may be interesting to measure a workload only after some initialization
 	phase is over, i.e. insert a perf probe at that point and use the above
diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST
index f5d72f936a6b..1da7f4b91b4f 100644
--- a/tools/perf/MANIFEST
+++ b/tools/perf/MANIFEST
@@ -3,7 +3,6 @@ tools/arch
 tools/scripts
 tools/build
 tools/include
-tools/lib/traceevent
 tools/lib/api
 tools/lib/bpf
 tools/lib/subcmd
@@ -13,8 +12,7 @@ tools/lib/ctype.c
 tools/lib/hweight.c
 tools/lib/rbtree.c
 tools/lib/string.c
-tools/lib/symbol/kallsyms.c
-tools/lib/symbol/kallsyms.h
+tools/lib/symbol
 tools/lib/find_bit.c
 tools/lib/bitmap.c
 tools/lib/list_sort.c
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index f3fe360a35c6..75f3f6e0a231 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -25,7 +25,7 @@ unexport MAKEFLAGS
 # (To override it, run 'make JOBS=1' and similar.)
 #
 ifeq ($(JOBS),)
-  JOBS := $(shell (getconf _NPROCESSORS_ONLN || egrep -c '^processor|^CPU[0-9]' /proc/cpuinfo) 2>/dev/null)
+  JOBS := $(shell (getconf _NPROCESSORS_ONLN || grep -E -c '^processor|^CPU[0-9]' /proc/cpuinfo) 2>/dev/null)
   ifeq ($(JOBS),0)
     JOBS := 1
   endif
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 898226ea8cad..83ed969b95b4 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -307,7 +307,7 @@ CORE_CFLAGS += -ggdb3
 CORE_CFLAGS += -funwind-tables
 CORE_CFLAGS += -Wall
 CORE_CFLAGS += -Wextra
-CORE_CFLAGS += -std=gnu99
+CORE_CFLAGS += -std=gnu11
 
 CXXFLAGS += -std=gnu++14 -fno-exceptions -fno-rtti
 CXXFLAGS += -Wall
@@ -349,7 +349,6 @@ ifeq ($(DEBUG),0)
   endif
 endif
 
-INC_FLAGS += -I$(srctree)/tools/lib/perf/include
 INC_FLAGS += -I$(src-perf)/util/include
 INC_FLAGS += -I$(src-perf)/arch/$(SRCARCH)/include
 INC_FLAGS += -I$(srctree)/tools/include/
@@ -367,7 +366,6 @@ endif
 
 INC_FLAGS += -I$(src-perf)/util
 INC_FLAGS += -I$(src-perf)
-INC_FLAGS += -I$(srctree)/tools/lib/
 
 CORE_CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE
 
@@ -765,18 +763,20 @@ ifndef NO_LIBUNWIND
   EXTLIBS += $(EXTLIBS_LIBUNWIND)
 endif
 
-ifeq ($(NO_SYSCALL_TABLE),0)
-  $(call detected,CONFIG_TRACE)
-else
-  ifndef NO_LIBAUDIT
-    $(call feature_check,libaudit)
-    ifneq ($(feature-libaudit), 1)
-      msg := $(warning No libaudit.h found, disables 'trace' tool, please install audit-libs-devel or libaudit-dev);
-      NO_LIBAUDIT := 1
-    else
-      CFLAGS += -DHAVE_LIBAUDIT_SUPPORT
-      EXTLIBS += -laudit
-      $(call detected,CONFIG_TRACE)
+ifneq ($(NO_LIBTRACEEVENT),1)
+  ifeq ($(NO_SYSCALL_TABLE),0)
+    $(call detected,CONFIG_TRACE)
+  else
+    ifndef NO_LIBAUDIT
+      $(call feature_check,libaudit)
+      ifneq ($(feature-libaudit), 1)
+        msg := $(warning No libaudit.h found, disables 'trace' tool, please install audit-libs-devel or libaudit-dev);
+        NO_LIBAUDIT := 1
+      else
+        CFLAGS += -DHAVE_LIBAUDIT_SUPPORT
+        EXTLIBS += -laudit
+        $(call detected,CONFIG_TRACE)
+      endif
     endif
   endif
 endif
@@ -871,6 +871,7 @@ define disable-python_code
   NO_LIBPYTHON := 1
 endef
 
+PYTHON_EXTENSION_SUFFIX := '.so'
 ifdef NO_LIBPYTHON
   $(call disable-python,Python support disabled by user)
 else
@@ -889,7 +890,8 @@ else
       else
          LDFLAGS += $(PYTHON_EMBED_LDFLAGS)
          EXTLIBS += $(PYTHON_EMBED_LIBADD)
-         LANG_BINDINGS += $(obj-perf)python/perf.so
+         PYTHON_EXTENSION_SUFFIX := $(shell $(PYTHON) -c 'from importlib import machinery; print(machinery.EXTENSION_SUFFIXES[0])')
+         LANG_BINDINGS += $(obj-perf)python/perf$(PYTHON_EXTENSION_SUFFIX)
          CFLAGS += -DHAVE_LIBPYTHON_SUPPORT
          $(call detected,CONFIG_LIBPYTHON)
       endif
@@ -1184,9 +1186,11 @@ ifdef LIBPFM4
   endif
 endif
 
-ifdef LIBTRACEEVENT_DYNAMIC
+# libtraceevent is a recommended dependency picked up from the system.
+ifneq ($(NO_LIBTRACEEVENT),1)
   $(call feature_check,libtraceevent)
   ifeq ($(feature-libtraceevent), 1)
+    CFLAGS += -DHAVE_LIBTRACEEVENT
     EXTLIBS += -ltraceevent
     LIBTRACEEVENT_VERSION := $(shell $(PKG_CONFIG) --modversion libtraceevent)
     LIBTRACEEVENT_VERSION_1 := $(word 1, $(subst ., ,$(LIBTRACEEVENT_VERSION)))
@@ -1194,12 +1198,15 @@ ifdef LIBTRACEEVENT_DYNAMIC
     LIBTRACEEVENT_VERSION_3 := $(word 3, $(subst ., ,$(LIBTRACEEVENT_VERSION)))
     LIBTRACEEVENT_VERSION_CPP := $(shell expr $(LIBTRACEEVENT_VERSION_1) \* 255 \* 255 + $(LIBTRACEEVENT_VERSION_2) \* 255 + $(LIBTRACEEVENT_VERSION_3))
     CFLAGS += -DLIBTRACEEVENT_VERSION=$(LIBTRACEEVENT_VERSION_CPP)
+    $(call detected,CONFIG_LIBTRACEEVENT)
+    LIBTRACEEVENT_VERSION_WITH_TEP_FIELD_IS_RELATIVE := $(shell expr 1 \* 255 \* 255 + 5 \* 255 + 0) # 1.5.0
+    ifeq ($(shell test $(LIBTRACEEVENT_VERSION_CPP) -gt $(LIBTRACEEVENT_VERSION_WITH_TEP_FIELD_IS_RELATIVE); echo $$?),0)
+      CFLAGS += -DHAVE_LIBTRACEEVENT_TEP_FIELD_IS_RELATIVE
+    endif
   else
-    dummy := $(error Error: No libtraceevent devel library found, please install libtraceevent-devel);
+    dummy := $(warning Warning: libtraceevent is missing limiting functionality, please install libtraceevent-dev/libtraceevent-devel)
   endif
-endif
 
-ifdef LIBTRACEFS_DYNAMIC
   $(call feature_check,libtracefs)
   ifeq ($(feature-libtracefs), 1)
     EXTLIBS += -ltracefs
@@ -1209,14 +1216,12 @@ ifdef LIBTRACEFS_DYNAMIC
     LIBTRACEFS_VERSION_3 := $(word 3, $(subst ., ,$(LIBTRACEFS_VERSION)))
     LIBTRACEFS_VERSION_CPP := $(shell expr $(LIBTRACEFS_VERSION_1) \* 255 \* 255 + $(LIBTRACEFS_VERSION_2) \* 255 + $(LIBTRACEFS_VERSION_3))
     CFLAGS += -DLIBTRACEFS_VERSION=$(LIBTRACEFS_VERSION_CPP)
-  else
-    dummy := $(error Error: No libtracefs devel library found, please install libtracefs-dev);
   endif
 endif
 
 # Among the variables below, these:
 #   perfexecdir
-#   perf_include_dir
+#   libbpf_include_dir
 #   perf_examples_dir
 #   template_dir
 #   mandir
@@ -1239,7 +1244,8 @@ includedir = $(abspath $(prefix)/$(includedir_relative))
 mandir = share/man
 infodir = share/info
 perfexecdir = libexec/perf-core
-perf_include_dir = lib/perf/include
+# FIXME: system's libbpf header directory, where we expect to find bpf/bpf_helpers.h, for instance
+libbpf_include_dir = /usr/include
 perf_examples_dir = lib/perf/examples
 sharedir = $(prefix)/share
 template_dir = share/perf-core/templates
@@ -1272,7 +1278,7 @@ includedir_SQ = $(subst ','\'',$(includedir))
 mandir_SQ = $(subst ','\'',$(mandir))
 infodir_SQ = $(subst ','\'',$(infodir))
 perfexecdir_SQ = $(subst ','\'',$(perfexecdir))
-perf_include_dir_SQ = $(subst ','\'',$(perf_include_dir))
+libbpf_include_dir_SQ = $(subst ','\'',$(libbpf_include_dir))
 perf_examples_dir_SQ = $(subst ','\'',$(perf_examples_dir))
 template_dir_SQ = $(subst ','\'',$(template_dir))
 htmldir_SQ = $(subst ','\'',$(htmldir))
@@ -1284,13 +1290,13 @@ srcdir_SQ = $(subst ','\'',$(srcdir))
 
 ifneq ($(filter /%,$(firstword $(perfexecdir))),)
 perfexec_instdir = $(perfexecdir)
-perf_include_instdir = $(perf_include_dir)
+perf_include_instdir = $(libbpf_include_dir)
 perf_examples_instdir = $(perf_examples_dir)
 STRACE_GROUPS_INSTDIR = $(STRACE_GROUPS_DIR)
 tip_instdir = $(tipdir)
 else
 perfexec_instdir = $(prefix)/$(perfexecdir)
-perf_include_instdir = $(prefix)/$(perf_include_dir)
+perf_include_instdir = $(prefix)/$(libbpf_include_dir)
 perf_examples_instdir = $(prefix)/$(perf_examples_dir)
 STRACE_GROUPS_INSTDIR = $(prefix)/$(STRACE_GROUPS_DIR)
 tip_instdir = $(prefix)/$(tipdir)
@@ -1352,7 +1358,7 @@ $(call detected_var,ETC_PERFCONFIG_SQ)
 $(call detected_var,STRACE_GROUPS_DIR_SQ)
 $(call detected_var,prefix_SQ)
 $(call detected_var,perfexecdir_SQ)
-$(call detected_var,perf_include_dir_SQ)
+$(call detected_var,libbpf_include_dir_SQ)
 $(call detected_var,perf_examples_dir_SQ)
 $(call detected_var,tipdir_SQ)
 $(call detected_var,srcdir_SQ)
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index a432e59afc42..9b7886ce0674 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -128,10 +128,6 @@ include ../scripts/utilities.mak
 #
 # Define BUILD_BPF_SKEL to enable BPF skeletons
 #
-# Define LIBTRACEEVENT_DYNAMIC to enable libtraceevent dynamic linking
-#
-# Define LIBTRACEFS_DYNAMIC to enable libtracefs dynamic linking
-#
 
 # As per kernel Makefile, avoid funny character set dependencies
 unexport LC_ALL
@@ -241,10 +237,10 @@ sub-make: fixdep
 
 else # force_fixdep
 
-LIB_DIR         = $(srctree)/tools/lib/api/
-TRACE_EVENT_DIR = $(srctree)/tools/lib/traceevent/
+LIBAPI_DIR      = $(srctree)/tools/lib/api/
 LIBBPF_DIR      = $(srctree)/tools/lib/bpf/
-SUBCMD_DIR      = $(srctree)/tools/lib/subcmd/
+LIBSUBCMD_DIR   = $(srctree)/tools/lib/subcmd/
+LIBSYMBOL_DIR   = $(srctree)/tools/lib/symbol/
 LIBPERF_DIR     = $(srctree)/tools/lib/perf/
 DOC_DIR         = $(srctree)/tools/perf/Documentation/
 
@@ -292,36 +288,15 @@ grep-libs = $(filter -l%,$(1))
 strip-libs = $(filter-out -l%,$(1))
 
 ifneq ($(OUTPUT),)
-  TE_PATH=$(OUTPUT)
-  PLUGINS_PATH=$(OUTPUT)
-  SUBCMD_PATH=$(OUTPUT)
-  LIBPERF_PATH=$(OUTPUT)
-ifneq ($(subdir),)
-  API_PATH=$(OUTPUT)/../lib/api/
+  LIBAPI_OUTPUT = $(abspath $(OUTPUT))/libapi
 else
-  API_PATH=$(OUTPUT)
+  LIBAPI_OUTPUT = $(CURDIR)/libapi
 endif
-else
-  TE_PATH=$(TRACE_EVENT_DIR)
-  PLUGINS_PATH=$(TRACE_EVENT_DIR)plugins/
-  API_PATH=$(LIB_DIR)
-  SUBCMD_PATH=$(SUBCMD_DIR)
-  LIBPERF_PATH=$(LIBPERF_DIR)
-endif
-
-LIBTRACEEVENT = $(TE_PATH)libtraceevent.a
-export LIBTRACEEVENT
-LIBTRACEEVENT_DYNAMIC_LIST = $(PLUGINS_PATH)libtraceevent-dynamic-list
-
-#
-# The static build has no dynsym table, so this does not work for
-# static build. Looks like linker starts to scream about that now
-# (in Fedora 26) so we need to switch it off for static build.
-DYNAMIC_LIST_LDFLAGS               = -Xlinker --dynamic-list=$(LIBTRACEEVENT_DYNAMIC_LIST)
-LIBTRACEEVENT_DYNAMIC_LIST_LDFLAGS = $(if $(findstring -static,$(LDFLAGS)),,$(DYNAMIC_LIST_LDFLAGS))
-
-LIBAPI = $(API_PATH)libapi.a
+LIBAPI_DESTDIR = $(LIBAPI_OUTPUT)
+LIBAPI_INCLUDE = $(LIBAPI_DESTDIR)/include
+LIBAPI = $(LIBAPI_OUTPUT)/libapi.a
 export LIBAPI
+CFLAGS += -I$(LIBAPI_OUTPUT)/include
 
 ifneq ($(OUTPUT),)
   LIBBPF_OUTPUT = $(abspath $(OUTPUT))/libbpf
@@ -331,11 +306,38 @@ endif
 LIBBPF_DESTDIR = $(LIBBPF_OUTPUT)
 LIBBPF_INCLUDE = $(LIBBPF_DESTDIR)/include
 LIBBPF = $(LIBBPF_OUTPUT)/libbpf.a
+CFLAGS += -I$(LIBBPF_OUTPUT)/include
 
-LIBSUBCMD = $(SUBCMD_PATH)libsubcmd.a
+ifneq ($(OUTPUT),)
+  LIBSUBCMD_OUTPUT = $(abspath $(OUTPUT))/libsubcmd
+else
+  LIBSUBCMD_OUTPUT = $(CURDIR)/libsubcmd
+endif
+LIBSUBCMD_DESTDIR = $(LIBSUBCMD_OUTPUT)
+LIBSUBCMD_INCLUDE = $(LIBSUBCMD_DESTDIR)/include
+LIBSUBCMD = $(LIBSUBCMD_OUTPUT)/libsubcmd.a
+CFLAGS += -I$(LIBSUBCMD_OUTPUT)/include
+
+ifneq ($(OUTPUT),)
+  LIBSYMBOL_OUTPUT = $(abspath $(OUTPUT))/libsymbol
+else
+  LIBSYMBOL_OUTPUT = $(CURDIR)/libsymbol
+endif
+LIBSYMBOL_DESTDIR = $(LIBSYMBOL_OUTPUT)
+LIBSYMBOL_INCLUDE = $(LIBSYMBOL_DESTDIR)/include
+LIBSYMBOL = $(LIBSYMBOL_OUTPUT)/libsymbol.a
+CFLAGS += -I$(LIBSYMBOL_OUTPUT)/include
 
-LIBPERF = $(LIBPERF_PATH)libperf.a
+ifneq ($(OUTPUT),)
+  LIBPERF_OUTPUT = $(abspath $(OUTPUT))/libperf
+else
+  LIBPERF_OUTPUT = $(CURDIR)/libperf
+endif
+LIBPERF_DESTDIR = $(LIBPERF_OUTPUT)
+LIBPERF_INCLUDE = $(LIBPERF_DESTDIR)/include
+LIBPERF = $(LIBPERF_OUTPUT)/libperf.a
 export LIBPERF
+CFLAGS += -I$(LIBPERF_OUTPUT)/include
 
 # python extension build directories
 PYTHON_EXTBUILD     := $(OUTPUT)python_ext_build/
@@ -345,8 +347,13 @@ export PYTHON_EXTBUILD_LIB PYTHON_EXTBUILD_TMP
 
 python-clean := $(call QUIET_CLEAN, python) $(RM) -r $(PYTHON_EXTBUILD) $(OUTPUT)python/perf*.so
 
-PYTHON_EXT_SRCS := $(shell grep -v ^\# util/python-ext-sources)
-PYTHON_EXT_DEPS := util/python-ext-sources util/setup.py $(LIBTRACEEVENT) $(LIBAPI)
+ifeq ($(CONFIG_LIBTRACEEVENT),y)
+  PYTHON_EXT_SRCS := $(shell grep -v ^\# util/python-ext-sources)
+else
+  PYTHON_EXT_SRCS := $(shell grep -v '^\#\|util/trace-event.c' util/python-ext-sources)
+endif
+
+PYTHON_EXT_DEPS := util/python-ext-sources util/setup.py $(LIBAPI)
 
 SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH))
 
@@ -385,15 +392,12 @@ endif
 
 export PERL_PATH
 
-PERFLIBS = $(LIBAPI) $(LIBSUBCMD) $(LIBPERF)
+PERFLIBS = $(LIBAPI) $(LIBPERF) $(LIBSUBCMD) $(LIBSYMBOL)
 ifndef NO_LIBBPF
   ifndef LIBBPF_DYNAMIC
     PERFLIBS += $(LIBBPF)
   endif
 endif
-ifndef LIBTRACEEVENT_DYNAMIC
-  PERFLIBS += $(LIBTRACEEVENT)
-endif
 
 # We choose to avoid "if .. else if .. else .. endif endif"
 # because maintaining the nesting to match is a pain.  If
@@ -643,9 +647,9 @@ all: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(OTHER_PROGRAMS)
 # Create python binding output directory if not already present
 _dummy := $(shell [ -d '$(OUTPUT)python' ] || mkdir -p '$(OUTPUT)python')
 
-$(OUTPUT)python/perf.so: $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS) $(LIBTRACEEVENT_DYNAMIC_LIST) $(LIBPERF)
+$(OUTPUT)python/perf$(PYTHON_EXTENSION_SUFFIX): $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS) $(LIBPERF)
 	$(QUIET_GEN)LDSHARED="$(CC) -pthread -shared" \
-        CFLAGS='$(CFLAGS)' LDFLAGS='$(LDFLAGS) $(LIBTRACEEVENT_DYNAMIC_LIST_LDFLAGS)' \
+        CFLAGS='$(CFLAGS)' LDFLAGS='$(LDFLAGS)' \
 	  $(PYTHON_WORD) util/setup.py \
 	  --quiet build_ext; \
 	cp $(PYTHON_EXTBUILD_LIB)perf*.so $(OUTPUT)python/
@@ -668,14 +672,14 @@ build := -f $(srctree)/tools/build/Makefile.build dir=. obj
 $(PERF_IN): prepare FORCE
 	$(Q)$(MAKE) $(build)=perf
 
-$(PMU_EVENTS_IN): FORCE
+$(PMU_EVENTS_IN): FORCE prepare
 	$(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=pmu-events obj=pmu-events
 
-$(OUTPUT)perf: $(PERFLIBS) $(PERF_IN) $(PMU_EVENTS_IN) $(LIBTRACEEVENT_DYNAMIC_LIST)
-	$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(LIBTRACEEVENT_DYNAMIC_LIST_LDFLAGS) \
+$(OUTPUT)perf: $(PERFLIBS) $(PERF_IN) $(PMU_EVENTS_IN)
+	$(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) \
 		$(PERF_IN) $(PMU_EVENTS_IN) $(LIBS) -o $@
 
-$(GTK_IN): FORCE
+$(GTK_IN): FORCE prepare
 	$(Q)$(MAKE) $(build)=gtk
 
 $(OUTPUT)libperf-gtk.so: $(GTK_IN) $(PERFLIBS)
@@ -751,6 +755,11 @@ prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h archheaders $(drm_ioc
 	$(rename_flags_array) \
 	$(arch_errno_name_array) \
 	$(sync_file_range_arrays) \
+	$(LIBAPI) \
+	$(LIBBPF) \
+	$(LIBPERF) \
+	$(LIBSUBCMD) \
+	$(LIBSYMBOL) \
 	bpf-skel
 
 $(OUTPUT)%.o: %.c prepare FORCE
@@ -808,30 +817,14 @@ endif
 
 $(patsubst perf-%,%.o,$(PROGRAMS)): $(wildcard */*.h)
 
-LIBTRACEEVENT_FLAGS += plugin_dir=$(plugindir_SQ) 'EXTRA_CFLAGS=$(EXTRA_CFLAGS)' 'LDFLAGS=$(filter-out -static,$(LDFLAGS))'
-
-$(LIBTRACEEVENT): FORCE
-	$(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) $(OUTPUT)libtraceevent.a
-
-libtraceevent_plugins: FORCE
-	$(Q)$(MAKE) -C $(TRACE_EVENT_DIR)plugins $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) plugins
-
-$(LIBTRACEEVENT_DYNAMIC_LIST): libtraceevent_plugins
-	$(Q)$(MAKE) -C $(TRACE_EVENT_DIR)plugins $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) $(OUTPUT)libtraceevent-dynamic-list
-
-$(LIBTRACEEVENT)-clean:
-	$(call QUIET_CLEAN, libtraceevent)
-	$(Q)$(MAKE) -C $(TRACE_EVENT_DIR) O=$(OUTPUT) clean >/dev/null
-
-install-traceevent-plugins: libtraceevent_plugins
-	$(Q)$(MAKE) -C $(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) O=$(OUTPUT) install_plugins
-
-$(LIBAPI): FORCE
-	$(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) $(OUTPUT)libapi.a
+$(LIBAPI): FORCE | $(LIBAPI_OUTPUT)
+	$(Q)$(MAKE) -C $(LIBAPI_DIR) O=$(LIBAPI_OUTPUT) \
+		DESTDIR=$(LIBAPI_DESTDIR) prefix= \
+		$@ install_headers
 
 $(LIBAPI)-clean:
 	$(call QUIET_CLEAN, libapi)
-	$(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) clean >/dev/null
+	$(Q)$(RM) -r -- $(LIBAPI_OUTPUT)
 
 $(LIBBPF): FORCE | $(LIBBPF_OUTPUT)
 	$(Q)$(MAKE) -C $(LIBBPF_DIR) FEATURES_DUMP=$(FEATURE_DUMP_EXPORT) \
@@ -842,18 +835,32 @@ $(LIBBPF)-clean:
 	$(call QUIET_CLEAN, libbpf)
 	$(Q)$(RM) -r -- $(LIBBPF_OUTPUT)
 
-$(LIBPERF): FORCE
-	$(Q)$(MAKE) -C $(LIBPERF_DIR) EXTRA_CFLAGS="$(LIBPERF_CFLAGS)" O=$(OUTPUT) $(OUTPUT)libperf.a
+$(LIBPERF): FORCE | $(LIBPERF_OUTPUT)
+	$(Q)$(MAKE) -C $(LIBPERF_DIR) O=$(LIBPERF_OUTPUT) \
+		DESTDIR=$(LIBPERF_DESTDIR) prefix= \
+		$@ install_headers
 
 $(LIBPERF)-clean:
 	$(call QUIET_CLEAN, libperf)
-	$(Q)$(MAKE) -C $(LIBPERF_DIR) O=$(OUTPUT) clean >/dev/null
+	$(Q)$(RM) -r -- $(LIBPERF_OUTPUT)
 
-$(LIBSUBCMD): FORCE
-	$(Q)$(MAKE) -C $(SUBCMD_DIR) O=$(OUTPUT) $(OUTPUT)libsubcmd.a
+$(LIBSUBCMD): FORCE | $(LIBSUBCMD_OUTPUT)
+	$(Q)$(MAKE) -C $(LIBSUBCMD_DIR) O=$(LIBSUBCMD_OUTPUT) \
+		DESTDIR=$(LIBSUBCMD_DESTDIR) prefix= \
+		$@ install_headers
 
 $(LIBSUBCMD)-clean:
-	$(Q)$(MAKE) -C $(SUBCMD_DIR) O=$(OUTPUT) clean
+	$(call QUIET_CLEAN, libsubcmd)
+	$(Q)$(RM) -r -- $(LIBSUBCMD_OUTPUT)
+
+$(LIBSYMBOL): FORCE | $(LIBSYMBOL_OUTPUT)
+	$(Q)$(MAKE) -C $(LIBSYMBOL_DIR) O=$(LIBSYMBOL_OUTPUT) \
+		DESTDIR=$(LIBSYMBOL_DESTDIR) prefix= \
+		$@ install_headers
+
+$(LIBSYMBOL)-clean:
+	$(call QUIET_CLEAN, libsymbol)
+	$(Q)$(RM) -r -- $(LIBSYMBOL_OUTPUT)
 
 help:
 	@echo 'Perf make targets:'
@@ -960,11 +967,6 @@ endif
 	$(call QUIET_INSTALL, libexec) \
 		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
 ifndef NO_LIBBPF
-	$(call QUIET_INSTALL, bpf-headers) \
-		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perf_include_instdir_SQ)/bpf'; \
-		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perf_include_instdir_SQ)/bpf/linux'; \
-		$(INSTALL) include/bpf/*.h -m 644 -t '$(DESTDIR_SQ)$(perf_include_instdir_SQ)/bpf'; \
-		$(INSTALL) include/bpf/linux/*.h -m 644 -t '$(DESTDIR_SQ)$(perf_include_instdir_SQ)/bpf/linux'
 	$(call QUIET_INSTALL, bpf-examples) \
 		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perf_examples_instdir_SQ)/bpf'; \
 		$(INSTALL) examples/bpf/*.c -m 644 -t '$(DESTDIR_SQ)$(perf_examples_instdir_SQ)/bpf'
@@ -1020,7 +1022,7 @@ install-tests: all install-gtk
 		$(INSTALL) tests/shell/coresight/*.sh '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/coresight'
 	$(Q)$(MAKE) -C tests/shell/coresight install-tests
 
-install-bin: install-tools install-tests install-traceevent-plugins
+install-bin: install-tools install-tests
 
 install: install-bin try-install-man
 
@@ -1044,7 +1046,7 @@ SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h $(SKEL_OUT)/func_latency.skel.h
 SKELETONS += $(SKEL_OUT)/off_cpu.skel.h $(SKEL_OUT)/lock_contention.skel.h
 SKELETONS += $(SKEL_OUT)/kwork_trace.skel.h
 
-$(SKEL_TMP_OUT) $(LIBBPF_OUTPUT):
+$(SKEL_TMP_OUT) $(LIBAPI_OUTPUT) $(LIBBPF_OUTPUT) $(LIBPERF_OUTPUT) $(LIBSUBCMD_OUTPUT) $(LIBSYMBOL_OUTPUT):
 	$(Q)$(MKDIR) -p $@
 
 ifdef BUILD_BPF_SKEL
@@ -1089,7 +1091,7 @@ endif # BUILD_BPF_SKEL
 bpf-skel-clean:
 	$(call QUIET_CLEAN, bpf-skel) $(RM) -r $(SKEL_TMP_OUT) $(SKELETONS)
 
-clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBPERF)-clean fixdep-clean python-clean bpf-skel-clean tests-coresight-targets-clean
+clean:: $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBSYMBOL)-clean $(LIBPERF)-clean fixdep-clean python-clean bpf-skel-clean tests-coresight-targets-clean
 	$(call QUIET_CLEAN, core-objs)  $(RM) $(LIBPERF_A) $(OUTPUT)perf-archive $(OUTPUT)perf-iostat $(LANG_BINDINGS)
 	$(Q)find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
 	$(Q)$(RM) $(OUTPUT).config-detected
@@ -1146,6 +1148,6 @@ FORCE:
 .PHONY: all install clean config-clean strip install-gtk
 .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
 .PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope FORCE prepare
-.PHONY: libtraceevent_plugins archheaders
+.PHONY: archheaders
 
 endif # force_fixdep
diff --git a/tools/perf/arch/arm/util/unwind-libdw.c b/tools/perf/arch/arm/util/unwind-libdw.c
index b7692cb0c733..1834a0cd9ce3 100644
--- a/tools/perf/arch/arm/util/unwind-libdw.c
+++ b/tools/perf/arch/arm/util/unwind-libdw.c
@@ -2,7 +2,7 @@
 #include <elfutils/libdwfl.h>
 #include "../../../util/unwind-libdw.h"
 #include "../../../util/perf_regs.h"
-#include "../../../util/event.h"
+#include "../../../util/sample.h"
 
 bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
 {
diff --git a/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl b/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl
index 459469b7222c..a7ca48d1e37b 100755
--- a/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl
+++ b/tools/perf/arch/arm64/entry/syscalls/mksyscalltbl
@@ -58,5 +58,5 @@ create_table()
 
 $gcc -E -dM -x c -I $incpath/include/uapi $input \
 	|sed -ne 's/^#define __NR_//p' \
-	|sort -t' ' -k2 -nu	       \
+	|sort -t' ' -k2 -n	       \
 	|create_table
diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build
index 337aa9bdf905..78ef7115be3d 100644
--- a/tools/perf/arch/arm64/util/Build
+++ b/tools/perf/arch/arm64/util/Build
@@ -3,7 +3,7 @@ perf-y += machine.o
 perf-y += perf_regs.o
 perf-y += tsc.o
 perf-y += pmu.o
-perf-y += kvm-stat.o
+perf-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
 perf-$(CONFIG_DWARF)     += dwarf-regs.o
 perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
 perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
diff --git a/tools/perf/arch/arm64/util/machine.c b/tools/perf/arch/arm64/util/machine.c
index 41c1596e5207..235a0a1e1ec7 100644
--- a/tools/perf/arch/arm64/util/machine.c
+++ b/tools/perf/arch/arm64/util/machine.c
@@ -7,6 +7,7 @@
 #include "symbol.h"
 #include "callchain.h"
 #include "record.h"
+#include "util/perf_regs.h"
 
 void arch__add_leaf_frame_record_opts(struct record_opts *opts)
 {
diff --git a/tools/perf/arch/arm64/util/pmu.c b/tools/perf/arch/arm64/util/pmu.c
index f849b1e88d43..477e513972a4 100644
--- a/tools/perf/arch/arm64/util/pmu.c
+++ b/tools/perf/arch/arm64/util/pmu.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <internal/cpumap.h>
 #include "../../../util/cpumap.h"
 #include "../../../util/pmu.h"
 
diff --git a/tools/perf/arch/arm64/util/unwind-libdw.c b/tools/perf/arch/arm64/util/unwind-libdw.c
index a50941629649..09385081bb03 100644
--- a/tools/perf/arch/arm64/util/unwind-libdw.c
+++ b/tools/perf/arch/arm64/util/unwind-libdw.c
@@ -2,7 +2,7 @@
 #include <elfutils/libdwfl.h>
 #include "../../../util/unwind-libdw.h"
 #include "../../../util/perf_regs.h"
-#include "../../../util/event.h"
+#include "../../../util/sample.h"
 
 bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
 {
diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build
index 0115f3166568..9889245c555c 100644
--- a/tools/perf/arch/powerpc/util/Build
+++ b/tools/perf/arch/powerpc/util/Build
@@ -1,5 +1,5 @@
 perf-y += header.o
-perf-y += kvm-stat.o
+perf-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
 perf-y += perf_regs.o
 perf-y += mem-events.o
 perf-y += sym-handling.o
diff --git a/tools/perf/arch/powerpc/util/event.c b/tools/perf/arch/powerpc/util/event.c
index cf430a4c55b9..77d8cc2b5691 100644
--- a/tools/perf/arch/powerpc/util/event.c
+++ b/tools/perf/arch/powerpc/util/event.c
@@ -9,6 +9,7 @@
 #include "../../../util/tool.h"
 #include "../../../util/map.h"
 #include "../../../util/debug.h"
+#include "../../../util/sample.h"
 
 void arch_perf_parse_sample_weight(struct perf_sample *data,
 				   const __u64 *array, u64 type)
diff --git a/tools/perf/arch/powerpc/util/unwind-libdw.c b/tools/perf/arch/powerpc/util/unwind-libdw.c
index 7b2d96ec28e3..e616642c754c 100644
--- a/tools/perf/arch/powerpc/util/unwind-libdw.c
+++ b/tools/perf/arch/powerpc/util/unwind-libdw.c
@@ -3,7 +3,7 @@
 #include <linux/kernel.h>
 #include "../../../util/unwind-libdw.h"
 #include "../../../util/perf_regs.h"
-#include "../../../util/event.h"
+#include "../../../util/sample.h"
 
 /* See backends/ppc_initreg.c and backends/ppc_regs.c in elfutils.  */
 static const int special_regs[3][2] = {
diff --git a/tools/perf/arch/riscv/util/Build b/tools/perf/arch/riscv/util/Build
index 7d3050134ae0..603dbb5ae4dc 100644
--- a/tools/perf/arch/riscv/util/Build
+++ b/tools/perf/arch/riscv/util/Build
@@ -1,4 +1,5 @@
 perf-y += perf_regs.o
+perf-y += header.o
 
 perf-$(CONFIG_DWARF) += dwarf-regs.o
 perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
diff --git a/tools/perf/arch/riscv/util/header.c b/tools/perf/arch/riscv/util/header.c
new file mode 100644
index 000000000000..4a41856938a8
--- /dev/null
+++ b/tools/perf/arch/riscv/util/header.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Implementation of get_cpuid().
+ *
+ * Author: Nikita Shubin <n.shubin@yadro.com>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <api/fs/fs.h>
+#include <errno.h>
+#include "../../util/debug.h"
+#include "../../util/header.h"
+
+#define CPUINFO_MVEN	"mvendorid"
+#define CPUINFO_MARCH	"marchid"
+#define CPUINFO_MIMP	"mimpid"
+#define CPUINFO		"/proc/cpuinfo"
+
+static char *_get_field(const char *line)
+{
+	char *line2, *nl;
+
+	line2 = strrchr(line, ' ');
+	if (!line2)
+		return NULL;
+
+	line2++;
+	nl = strrchr(line, '\n');
+	if (!nl)
+		return NULL;
+
+	return strndup(line2, nl - line2);
+}
+
+static char *_get_cpuid(void)
+{
+	char *line = NULL;
+	char *mvendorid = NULL;
+	char *marchid = NULL;
+	char *mimpid = NULL;
+	char *cpuid = NULL;
+	int read;
+	unsigned long line_sz;
+	FILE *cpuinfo;
+
+	cpuinfo = fopen(CPUINFO, "r");
+	if (cpuinfo == NULL)
+		return cpuid;
+
+	while ((read = getline(&line, &line_sz, cpuinfo)) != -1) {
+		if (!strncmp(line, CPUINFO_MVEN, strlen(CPUINFO_MVEN))) {
+			mvendorid = _get_field(line);
+			if (!mvendorid)
+				goto free;
+		} else if (!strncmp(line, CPUINFO_MARCH, strlen(CPUINFO_MARCH))) {
+			marchid = _get_field(line);
+			if (!marchid)
+				goto free;
+		} else if (!strncmp(line, CPUINFO_MIMP, strlen(CPUINFO_MIMP))) {
+			mimpid = _get_field(line);
+			if (!mimpid)
+				goto free;
+
+			break;
+		}
+	}
+
+	if (!mvendorid || !marchid || !mimpid)
+		goto free;
+
+	if (asprintf(&cpuid, "%s-%s-%s", mvendorid, marchid, mimpid) < 0)
+		cpuid = NULL;
+
+free:
+	fclose(cpuinfo);
+	free(mvendorid);
+	free(marchid);
+	free(mimpid);
+
+	return cpuid;
+}
+
+int get_cpuid(char *buffer, size_t sz)
+{
+	char *cpuid = _get_cpuid();
+	int ret = 0;
+
+	if (sz < strlen(cpuid)) {
+		ret = -EINVAL;
+		goto free;
+	}
+
+	scnprintf(buffer, sz, "%s", cpuid);
+free:
+	free(cpuid);
+	return ret;
+}
+
+char *
+get_cpuid_str(struct perf_pmu *pmu __maybe_unused)
+{
+	return _get_cpuid();
+}
diff --git a/tools/perf/arch/s390/util/Build b/tools/perf/arch/s390/util/Build
index 3d9d0f4f72ca..db6884086997 100644
--- a/tools/perf/arch/s390/util/Build
+++ b/tools/perf/arch/s390/util/Build
@@ -1,5 +1,5 @@
 perf-y += header.o
-perf-y += kvm-stat.o
+perf-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
 perf-y += perf_regs.o
 
 perf-$(CONFIG_DWARF) += dwarf-regs.o
diff --git a/tools/perf/arch/s390/util/unwind-libdw.c b/tools/perf/arch/s390/util/unwind-libdw.c
index 387c698cdd1b..7d92452d5287 100644
--- a/tools/perf/arch/s390/util/unwind-libdw.c
+++ b/tools/perf/arch/s390/util/unwind-libdw.c
@@ -3,6 +3,7 @@
 #include "../../util/unwind-libdw.h"
 #include "../../util/perf_regs.h"
 #include "../../util/event.h"
+#include "../../util/sample.h"
 #include "dwarf-regs-table.h"
 
 
diff --git a/tools/perf/arch/x86/include/arch-tests.h b/tools/perf/arch/x86/include/arch-tests.h
index 6a1a1b3c0827..902e9ea9b99e 100644
--- a/tools/perf/arch/x86/include/arch-tests.h
+++ b/tools/perf/arch/x86/include/arch-tests.h
@@ -8,6 +8,7 @@ struct test_suite;
 int test__rdpmc(struct test_suite *test, int subtest);
 int test__insn_x86(struct test_suite *test, int subtest);
 int test__intel_pt_pkt_decoder(struct test_suite *test, int subtest);
+int test__intel_pt_hybrid_compat(struct test_suite *test, int subtest);
 int test__bp_modify(struct test_suite *test, int subtest);
 int test__x86_sample_parsing(struct test_suite *test, int subtest);
 
diff --git a/tools/perf/arch/x86/tests/Build b/tools/perf/arch/x86/tests/Build
index 70b5bcbc15df..6f4e8636c3bf 100644
--- a/tools/perf/arch/x86/tests/Build
+++ b/tools/perf/arch/x86/tests/Build
@@ -3,5 +3,5 @@ perf-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o
 
 perf-y += arch-tests.o
 perf-y += sample-parsing.o
-perf-$(CONFIG_AUXTRACE) += insn-x86.o intel-pt-pkt-decoder-test.o
+perf-$(CONFIG_AUXTRACE) += insn-x86.o intel-pt-test.o
 perf-$(CONFIG_X86_64) += bp-modify.o
diff --git a/tools/perf/arch/x86/tests/arch-tests.c b/tools/perf/arch/x86/tests/arch-tests.c
index 04018b8aa85b..aae6ea0fe52b 100644
--- a/tools/perf/arch/x86/tests/arch-tests.c
+++ b/tools/perf/arch/x86/tests/arch-tests.c
@@ -5,7 +5,18 @@
 
 #ifdef HAVE_AUXTRACE_SUPPORT
 DEFINE_SUITE("x86 instruction decoder - new instructions", insn_x86);
-DEFINE_SUITE("Intel PT packet decoder", intel_pt_pkt_decoder);
+
+static struct test_case intel_pt_tests[] = {
+	TEST_CASE("Intel PT packet decoder", intel_pt_pkt_decoder),
+	TEST_CASE("Intel PT hybrid CPU compatibility", intel_pt_hybrid_compat),
+	{ .name = NULL, }
+};
+
+struct test_suite suite__intel_pt = {
+	.desc = "Intel PT",
+	.test_cases = intel_pt_tests,
+};
+
 #endif
 #if defined(__x86_64__)
 DEFINE_SUITE("x86 bp modify", bp_modify);
@@ -18,7 +29,7 @@ struct test_suite *arch_tests[] = {
 #endif
 #ifdef HAVE_AUXTRACE_SUPPORT
 	&suite__insn_x86,
-	&suite__intel_pt_pkt_decoder,
+	&suite__intel_pt,
 #endif
 #if defined(__x86_64__)
 	&suite__bp_modify,
diff --git a/tools/perf/arch/x86/tests/intel-pt-pkt-decoder-test.c b/tools/perf/arch/x86/tests/intel-pt-test.c
index 42237656f453..70b7f79396b1 100644
--- a/tools/perf/arch/x86/tests/intel-pt-pkt-decoder-test.c
+++ b/tools/perf/arch/x86/tests/intel-pt-test.c
@@ -1,12 +1,17 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <linux/compiler.h>
+#include <linux/bits.h>
 #include <string.h>
+#include <cpuid.h>
+#include <sched.h>
 
 #include "intel-pt-decoder/intel-pt-pkt-decoder.h"
 
 #include "debug.h"
 #include "tests/tests.h"
 #include "arch-tests.h"
+#include "cpumap.h"
 
 /**
  * struct test_data - Test data.
@@ -313,3 +318,152 @@ int test__intel_pt_pkt_decoder(struct test_suite *test __maybe_unused, int subte
 
 	return TEST_OK;
 }
+
+static int setaffinity(int cpu)
+{
+	cpu_set_t cpu_set;
+
+	CPU_ZERO(&cpu_set);
+	CPU_SET(cpu, &cpu_set);
+	if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set)) {
+		pr_debug("sched_setaffinity() failed for CPU %d\n", cpu);
+		return -1;
+	}
+	return 0;
+}
+
+#define INTEL_PT_ADDR_FILT_CNT_MASK	GENMASK(2, 0)
+#define INTEL_PT_SUBLEAF_CNT		2
+#define CPUID_REG_CNT			4
+
+struct cpuid_result {
+	union {
+		struct {
+			unsigned int eax;
+			unsigned int ebx;
+			unsigned int ecx;
+			unsigned int edx;
+		};
+		unsigned int reg[CPUID_REG_CNT];
+	};
+};
+
+struct pt_caps {
+	struct cpuid_result subleaf[INTEL_PT_SUBLEAF_CNT];
+};
+
+static int get_pt_caps(int cpu, struct pt_caps *caps)
+{
+	struct cpuid_result r;
+	int i;
+
+	if (setaffinity(cpu))
+		return -1;
+
+	memset(caps, 0, sizeof(*caps));
+
+	for (i = 0; i < INTEL_PT_SUBLEAF_CNT; i++) {
+		__get_cpuid_count(20, i, &r.eax, &r.ebx, &r.ecx, &r.edx);
+		pr_debug("CPU %d CPUID leaf 20 subleaf %d\n", cpu, i);
+		pr_debug("eax = 0x%08x\n", r.eax);
+		pr_debug("ebx = 0x%08x\n", r.ebx);
+		pr_debug("ecx = 0x%08x\n", r.ecx);
+		pr_debug("edx = 0x%08x\n", r.edx);
+		caps->subleaf[i] = r;
+	}
+
+	return 0;
+}
+
+static bool is_hydrid(void)
+{
+	unsigned int eax, ebx, ecx, edx = 0;
+	bool result;
+
+	__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
+	result = edx & BIT(15);
+	pr_debug("Is %shybrid : CPUID leaf 7 subleaf 0 edx %#x (bit-15 indicates hybrid)\n",
+		 result ? "" : "not ", edx);
+	return result;
+}
+
+static int compare_caps(int cpu, struct pt_caps *caps, struct pt_caps *caps0)
+{
+	struct pt_caps mask = { /* Mask of bits to check*/
+		.subleaf = {
+			[0] = {
+				.ebx = GENMASK(8, 0),
+				.ecx = GENMASK(3, 0),
+			},
+			[1] = {
+				.eax = GENMASK(31, 16),
+				.ebx = GENMASK(31, 0),
+			}
+		}
+	};
+	unsigned int m, reg, reg0;
+	int ret = 0;
+	int i, j;
+
+	for (i = 0; i < INTEL_PT_SUBLEAF_CNT; i++) {
+		for (j = 0; j < CPUID_REG_CNT; j++) {
+			m = mask.subleaf[i].reg[j];
+			reg = m & caps->subleaf[i].reg[j];
+			reg0 = m & caps0->subleaf[i].reg[j];
+			if ((reg & reg0) != reg0) {
+				pr_debug("CPU %d subleaf %d reg %d FAIL %#x vs %#x\n",
+					 cpu, i, j, reg, reg0);
+				ret = -1;
+			}
+		}
+	}
+
+	m = INTEL_PT_ADDR_FILT_CNT_MASK;
+	reg = m & caps->subleaf[1].eax;
+	reg0 = m & caps0->subleaf[1].eax;
+	if (reg < reg0) {
+		pr_debug("CPU %d subleaf 1 reg 0 FAIL address filter count %#x vs %#x\n",
+			 cpu, reg, reg0);
+		ret = -1;
+	}
+
+	if (!ret)
+		pr_debug("CPU %d OK\n", cpu);
+
+	return ret;
+}
+
+int test__intel_pt_hybrid_compat(struct test_suite *test, int subtest)
+{
+	int max_cpu = cpu__max_cpu().cpu;
+	struct pt_caps last_caps;
+	struct pt_caps caps0;
+	int ret = TEST_OK;
+	int cpu;
+
+	if (!is_hydrid()) {
+		test->test_cases[subtest].skip_reason = "not hybrid";
+		return TEST_SKIP;
+	}
+
+	if (get_pt_caps(0, &caps0))
+		return TEST_FAIL;
+
+	for (cpu = 1, last_caps = caps0; cpu < max_cpu; cpu++) {
+		struct pt_caps caps;
+
+		if (get_pt_caps(cpu, &caps)) {
+			pr_debug("CPU %d not found\n", cpu);
+			continue;
+		}
+		if (!memcmp(&caps, &last_caps, sizeof(caps))) {
+			pr_debug("CPU %d same caps as previous CPU\n", cpu);
+			continue;
+		}
+		if (compare_caps(cpu, &caps, &caps0))
+			ret = TEST_FAIL;
+		last_caps = caps;
+	}
+
+	return ret;
+}
diff --git a/tools/perf/arch/x86/tests/sample-parsing.c b/tools/perf/arch/x86/tests/sample-parsing.c
index bfbd3662b69e..690c7c07e90d 100644
--- a/tools/perf/arch/x86/tests/sample-parsing.c
+++ b/tools/perf/arch/x86/tests/sample-parsing.c
@@ -10,6 +10,7 @@
 #include "event.h"
 #include "evsel.h"
 #include "debug.h"
+#include "util/sample.h"
 #include "util/synthetic-events.h"
 
 #include "tests/tests.h"
diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build
index dbeb04cb336e..195ccfdef7aa 100644
--- a/tools/perf/arch/x86/util/Build
+++ b/tools/perf/arch/x86/util/Build
@@ -1,7 +1,7 @@
 perf-y += header.o
 perf-y += tsc.o
 perf-y += pmu.o
-perf-y += kvm-stat.o
+perf-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
 perf-y += perf_regs.o
 perf-y += topdown.o
 perf-y += machine.o
diff --git a/tools/perf/arch/x86/util/event.c b/tools/perf/arch/x86/util/event.c
index e670f3547581..a3acefe6d0c6 100644
--- a/tools/perf/arch/x86/util/event.c
+++ b/tools/perf/arch/x86/util/event.c
@@ -2,6 +2,7 @@
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/zalloc.h>
+#include <stdlib.h>
 
 #include "../../../util/event.h"
 #include "../../../util/synthetic-events.h"
@@ -9,6 +10,7 @@
 #include "../../../util/tool.h"
 #include "../../../util/map.h"
 #include "../../../util/debug.h"
+#include "util/sample.h"
 
 #if defined(__x86_64__)
 
diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c
index af102f471e9f..1e39a034cee9 100644
--- a/tools/perf/arch/x86/util/intel-pt.c
+++ b/tools/perf/arch/x86/util/intel-pt.c
@@ -418,6 +418,7 @@ static int intel_pt_info_fill(struct auxtrace_record *itr,
 	return 0;
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 static int intel_pt_track_switches(struct evlist *evlist)
 {
 	const char *sched_switch = "sched:sched_switch";
@@ -439,6 +440,7 @@ static int intel_pt_track_switches(struct evlist *evlist)
 
 	return 0;
 }
+#endif
 
 static void intel_pt_valid_str(char *str, size_t len, u64 valid)
 {
@@ -829,6 +831,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
 					ptr->have_sched_switch = 2;
 			}
 		} else {
+#ifdef HAVE_LIBTRACEEVENT
 			err = intel_pt_track_switches(evlist);
 			if (err == -EPERM)
 				pr_debug2("Unable to select sched:sched_switch\n");
@@ -836,6 +839,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
 				return err;
 			else
 				ptr->have_sched_switch = 1;
+#endif
 		}
 	}
 
diff --git a/tools/perf/arch/x86/util/iostat.c b/tools/perf/arch/x86/util/iostat.c
index 404de795ec0b..7eb0a7b00b95 100644
--- a/tools/perf/arch/x86/util/iostat.c
+++ b/tools/perf/arch/x86/util/iostat.c
@@ -449,7 +449,7 @@ void iostat_print_metric(struct perf_stat_config *config, struct evsel *evsel,
 
 void iostat_print_counters(struct evlist *evlist,
 			   struct perf_stat_config *config, struct timespec *ts,
-			   char *prefix, iostat_print_counter_t print_cnt_cb)
+			   char *prefix, iostat_print_counter_t print_cnt_cb, void *arg)
 {
 	void *perf_device = NULL;
 	struct evsel *counter = evlist__first(evlist);
@@ -464,7 +464,7 @@ void iostat_print_counters(struct evlist *evlist,
 			iostat_prefix(evlist, config, prefix, ts);
 			fprintf(config->output, "\n%s", prefix);
 		}
-		print_cnt_cb(config, counter, prefix);
+		print_cnt_cb(config, counter, arg);
 	}
 	fputc('\n', config->output);
 }
diff --git a/tools/perf/arch/x86/util/tsc.c b/tools/perf/arch/x86/util/tsc.c
index eb2b5195bd02..9b99f48b923c 100644
--- a/tools/perf/arch/x86/util/tsc.c
+++ b/tools/perf/arch/x86/util/tsc.c
@@ -2,6 +2,7 @@
 #include <linux/types.h>
 #include <math.h>
 #include <string.h>
+#include <stdlib.h>
 
 #include "../../../util/debug.h"
 #include "../../../util/tsc.h"
diff --git a/tools/perf/arch/x86/util/unwind-libdw.c b/tools/perf/arch/x86/util/unwind-libdw.c
index eea2bf87232b..ef71e8bf80bf 100644
--- a/tools/perf/arch/x86/util/unwind-libdw.c
+++ b/tools/perf/arch/x86/util/unwind-libdw.c
@@ -2,7 +2,7 @@
 #include <elfutils/libdwfl.h>
 #include "../../../util/unwind-libdw.h"
 #include "../../../util/perf_regs.h"
-#include "../../../util/event.h"
+#include "util/sample.h"
 
 bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
 {
diff --git a/tools/perf/bench/find-bit-bench.c b/tools/perf/bench/find-bit-bench.c
index 22b5cfe97023..d103c3136983 100644
--- a/tools/perf/bench/find-bit-bench.c
+++ b/tools/perf/bench/find-bit-bench.c
@@ -70,7 +70,7 @@ static int do_for_each_set_bit(unsigned int num_bits)
 		bitmap_zero(to_test, num_bits);
 		skip = num_bits / set_bits;
 		for (i = 0; i < num_bits; i += skip)
-			set_bit(i, to_test);
+			__set_bit(i, to_test);
 
 		for (i = 0; i < outer_iterations; i++) {
 			old = accumulator;
diff --git a/tools/perf/bench/inject-buildid.c b/tools/perf/bench/inject-buildid.c
index 17672790f123..4561bda0ce6a 100644
--- a/tools/perf/bench/inject-buildid.c
+++ b/tools/perf/bench/inject-buildid.c
@@ -19,10 +19,10 @@
 #include "util/data.h"
 #include "util/stat.h"
 #include "util/debug.h"
-#include "util/event.h"
 #include "util/symbol.h"
 #include "util/session.h"
 #include "util/build-id.h"
+#include "util/sample.h"
 #include "util/synthetic-events.h"
 
 #define MMAP_DEV_MAJOR  8
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index e78dedf9e682..9717c6c17433 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -16,6 +16,7 @@
 #include <sched.h>
 #include <stdio.h>
 #include <assert.h>
+#include <debug.h>
 #include <malloc.h>
 #include <signal.h>
 #include <stdlib.h>
@@ -116,7 +117,6 @@ struct params {
 	long			bytes_thread;
 
 	int			nr_tasks;
-	bool			show_quiet;
 
 	bool			show_convergence;
 	bool			measure_convergence;
@@ -197,7 +197,8 @@ static const struct option options[] = {
 	OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details, "
 		    "convergence is reached when each process (all its threads) is running on a single NUMA node."),
 	OPT_BOOLEAN('m', "measure_convergence",	&p0.measure_convergence, "measure convergence latency"),
-	OPT_BOOLEAN('q', "quiet"	, &p0.show_quiet,	"quiet mode"),
+	OPT_BOOLEAN('q', "quiet"	, &quiet,
+		    "quiet mode (do not show any warnings or messages)"),
 	OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"),
 
 	/* Special option string parsing callbacks: */
@@ -1474,7 +1475,7 @@ static int init(void)
 	/* char array in count_process_nodes(): */
 	BUG_ON(g->p.nr_nodes < 0);
 
-	if (g->p.show_quiet && !g->p.show_details)
+	if (quiet && !g->p.show_details)
 		g->p.show_details = -1;
 
 	/* Some memory should be specified: */
@@ -1553,7 +1554,7 @@ static void print_res(const char *name, double val,
 	if (!name)
 		name = "main,";
 
-	if (!g->p.show_quiet)
+	if (!quiet)
 		printf(" %-30s %15.3f, %-15s %s\n", name, val, txt_unit, txt_short);
 	else
 		printf(" %14.3f %s\n", val, txt_long);
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index f839e69492e8..90458ca6933f 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -499,7 +499,9 @@ int cmd_annotate(int argc, const char **argv)
 			.namespaces = perf_event__process_namespaces,
 			.attr	= perf_event__process_attr,
 			.build_id = perf_event__process_build_id,
+#ifdef HAVE_LIBTRACEEVENT
 			.tracing_data   = perf_event__process_tracing_data,
+#endif
 			.id_index	= perf_event__process_id_index,
 			.auxtrace_info	= perf_event__process_auxtrace_info,
 			.auxtrace	= perf_event__process_auxtrace,
@@ -525,7 +527,7 @@ int cmd_annotate(int argc, const char **argv)
 	OPT_BOOLEAN('f', "force", &data.force, "don't complain, do it"),
 	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
-	OPT_BOOLEAN('q', "quiet", &quiet, "do now show any message"),
+	OPT_BOOLEAN('q', "quiet", &quiet, "do now show any warnings or messages"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
 #ifdef HAVE_GTK2_SUPPORT
diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
index a9190458d2d5..52d94c7dd836 100644
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@@ -230,7 +230,7 @@ static void c2c_he__set_cpu(struct c2c_hist_entry *c2c_he,
 		      "WARNING: no sample cpu value"))
 		return;
 
-	set_bit(sample->cpu, c2c_he->cpuset);
+	__set_bit(sample->cpu, c2c_he->cpuset);
 }
 
 static void c2c_he__set_node(struct c2c_hist_entry *c2c_he,
@@ -247,7 +247,7 @@ static void c2c_he__set_node(struct c2c_hist_entry *c2c_he,
 	if (WARN_ONCE(node < 0, "WARNING: failed to find node\n"))
 		return;
 
-	set_bit(node, c2c_he->nodeset);
+	__set_bit(node, c2c_he->nodeset);
 
 	if (c2c_he->paddr != sample->phys_addr) {
 		c2c_he->paddr_cnt++;
@@ -2318,7 +2318,7 @@ static int setup_nodes(struct perf_session *session)
 			continue;
 
 		perf_cpu_map__for_each_cpu(cpu, idx, map) {
-			set_bit(cpu.cpu, set);
+			__set_bit(cpu.cpu, set);
 
 			if (WARN_ONCE(cpu2node[cpu.cpu] != -1, "node/cpu topology bug"))
 				return -EINVAL;
diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c
index 6cb3f6cc36d0..7036ec92d47d 100644
--- a/tools/perf/builtin-daemon.c
+++ b/tools/perf/builtin-daemon.c
@@ -100,12 +100,12 @@ static struct daemon __daemon = {
 };
 
 static const char * const daemon_usage[] = {
-	"perf daemon start [<options>]",
+	"perf daemon {start|signal|stop|ping} [<options>]",
 	"perf daemon [<options>]",
 	NULL
 };
 
-static bool done;
+static volatile sig_atomic_t done;
 
 static void sig_handler(int sig __maybe_unused)
 {
diff --git a/tools/perf/builtin-data.c b/tools/perf/builtin-data.c
index c22d82d2a73c..b2a9a3b7f68d 100644
--- a/tools/perf/builtin-data.c
+++ b/tools/perf/builtin-data.c
@@ -78,12 +78,13 @@ static int cmd_data_convert(int argc, const char **argv)
 		return bt_convert__perf2json(input_name, to_json, &opts);
 
 	if (to_ctf) {
-#ifdef HAVE_LIBBABELTRACE_SUPPORT
+#if defined(HAVE_LIBBABELTRACE_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
 		return bt_convert__perf2ctf(input_name, to_ctf, &opts);
 #else
 		pr_err("The libbabeltrace support is not compiled in. perf should be "
 		       "compiled with environment variables LIBBABELTRACE=1 and "
-		       "LIBBABELTRACE_DIR=/path/to/libbabeltrace/\n");
+		       "LIBBABELTRACE_DIR=/path/to/libbabeltrace/.\n"
+		       "Check also if libbtraceevent devel files are available.\n");
 		return -1;
 #endif
 	}
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index d925096dd7f0..ed07cc6cca56 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -1260,7 +1260,7 @@ static const char * const diff_usage[] = {
 static const struct option options[] = {
 	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
-	OPT_BOOLEAN('q', "quiet", &quiet, "Do not show any message"),
+	OPT_BOOLEAN('q', "quiet", &quiet, "Do not show any warnings or messages"),
 	OPT_BOOLEAN('b', "baseline-only", &show_baseline_only,
 		    "Show only items with match in baseline"),
 	OPT_CALLBACK('c', "compute", &compute,
diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
index 7de07bb16d23..d7fe00f66b83 100644
--- a/tools/perf/builtin-ftrace.c
+++ b/tools/perf/builtin-ftrace.c
@@ -36,8 +36,8 @@
 
 #define DEFAULT_TRACER  "function_graph"
 
-static volatile int workload_exec_errno;
-static bool done;
+static volatile sig_atomic_t workload_exec_errno;
+static volatile sig_atomic_t done;
 
 static void sig_handler(int sig __maybe_unused)
 {
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index e254f18986f7..3f4e4dd5abf3 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -607,6 +607,7 @@ static int perf_event__repipe_exit(struct perf_tool *tool,
 	return err;
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 static int perf_event__repipe_tracing_data(struct perf_session *session,
 					   union perf_event *event)
 {
@@ -614,6 +615,7 @@ static int perf_event__repipe_tracing_data(struct perf_session *session,
 
 	return perf_event__process_tracing_data(session, event);
 }
+#endif
 
 static int dso__read_build_id(struct dso *dso)
 {
@@ -807,6 +809,7 @@ static int perf_inject__sched_switch(struct perf_tool *tool,
 	return 0;
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 static int perf_inject__sched_stat(struct perf_tool *tool,
 				   union perf_event *event __maybe_unused,
 				   struct perf_sample *sample,
@@ -836,6 +839,7 @@ found:
 	build_id__mark_dso_hit(tool, event_sw, &sample_sw, evsel, machine);
 	return perf_event__repipe(tool, event_sw, &sample_sw, machine);
 }
+#endif
 
 static struct guest_vcpu *guest_session__vcpu(struct guest_session *gs, u32 vcpu)
 {
@@ -1961,7 +1965,9 @@ static int __cmd_inject(struct perf_inject *inject)
 		inject->tool.mmap	  = perf_event__repipe_mmap;
 		inject->tool.mmap2	  = perf_event__repipe_mmap2;
 		inject->tool.fork	  = perf_event__repipe_fork;
+#ifdef HAVE_LIBTRACEEVENT
 		inject->tool.tracing_data = perf_event__repipe_tracing_data;
+#endif
 	}
 
 	output_data_offset = perf_session__data_offset(session->evlist);
@@ -1984,8 +1990,10 @@ static int __cmd_inject(struct perf_inject *inject)
 				evsel->handler = perf_inject__sched_switch;
 			} else if (!strcmp(name, "sched:sched_process_exit"))
 				evsel->handler = perf_inject__sched_process_exit;
+#ifdef HAVE_LIBTRACEEVENT
 			else if (!strncmp(name, "sched:sched_stat_", 17))
 				evsel->handler = perf_inject__sched_stat;
+#endif
 		}
 	} else if (inject->itrace_synth_opts.vm_time_correlation) {
 		session->itrace_synth_opts = &inject->itrace_synth_opts;
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index ebfab2ca1702..e20656c431a4 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -35,6 +35,7 @@
 #include <regex.h>
 
 #include <linux/ctype.h>
+#include <traceevent/event-parse.h>
 
 static int	kmem_slab;
 static int	kmem_page;
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index 7d9ec1bac1a2..641e739c717c 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -63,7 +63,7 @@ static const char *get_filename_for_perf_kvm(void)
 	return filename;
 }
 
-#ifdef HAVE_KVM_STAT_SUPPORT
+#if defined(HAVE_KVM_STAT_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
 
 void exit_event_get_key(struct evsel *evsel,
 			struct perf_sample *sample,
@@ -654,7 +654,7 @@ static void print_result(struct perf_kvm_stat *kvm)
 		pr_info("\nLost events: %" PRIu64 "\n\n", kvm->lost_events);
 }
 
-#ifdef HAVE_TIMERFD_SUPPORT
+#if defined(HAVE_TIMERFD_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
 static int process_lost_event(struct perf_tool *tool,
 			      union perf_event *event __maybe_unused,
 			      struct perf_sample *sample __maybe_unused,
@@ -742,7 +742,7 @@ static bool verify_vcpu(int vcpu)
 	return true;
 }
 
-#ifdef HAVE_TIMERFD_SUPPORT
+#if defined(HAVE_TIMERFD_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
 /* keeping the max events to a modest level to keep
  * the processing of samples per mmap smooth.
  */
@@ -1290,7 +1290,7 @@ kvm_events_report(struct perf_kvm_stat *kvm, int argc, const char **argv)
 	return kvm_events_report_vcpu(kvm);
 }
 
-#ifdef HAVE_TIMERFD_SUPPORT
+#if defined(HAVE_TIMERFD_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
 static struct evlist *kvm_live_event_list(void)
 {
 	struct evlist *evlist;
@@ -1507,7 +1507,7 @@ static int kvm_cmd_stat(const char *file_name, int argc, const char **argv)
 	if (strlen(argv[1]) > 2 && strstarts("report", argv[1]))
 		return kvm_events_report(&kvm, argc - 1 , argv + 1);
 
-#ifdef HAVE_TIMERFD_SUPPORT
+#if defined(HAVE_TIMERFD_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
 	if (!strncmp(argv[1], "live", 4))
 		return kvm_events_live(&kvm, argc - 1 , argv + 1);
 #endif
@@ -1644,7 +1644,7 @@ int cmd_kvm(int argc, const char **argv)
 		return cmd_top(argc, argv);
 	else if (strlen(argv[0]) > 2 && strstarts("buildid-list", argv[0]))
 		return __cmd_buildid_list(file_name, argc, argv);
-#ifdef HAVE_KVM_STAT_SUPPORT
+#if defined(HAVE_KVM_STAT_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
 	else if (strlen(argv[0]) > 2 && strstarts("stat", argv[0]))
 		return kvm_cmd_stat(file_name, argc, argv);
 #endif
diff --git a/tools/perf/builtin-kwork.c b/tools/perf/builtin-kwork.c
index fb8c63656ad8..dc59d75180d1 100644
--- a/tools/perf/builtin-kwork.c
+++ b/tools/perf/builtin-kwork.c
@@ -6,10 +6,15 @@
  */
 
 #include "builtin.h"
+#include "perf.h"
 
 #include "util/data.h"
+#include "util/evlist.h"
+#include "util/evsel.h"
+#include "util/header.h"
 #include "util/kwork.h"
 #include "util/debug.h"
+#include "util/session.h"
 #include "util/symbol.h"
 #include "util/thread.h"
 #include "util/string2.h"
@@ -18,9 +23,11 @@
 
 #include <subcmd/pager.h>
 #include <subcmd/parse-options.h>
+#include <traceevent/event-parse.h>
 
 #include <errno.h>
 #include <inttypes.h>
+#include <signal.h>
 #include <linux/err.h>
 #include <linux/time64.h>
 #include <linux/zalloc.h>
@@ -216,7 +223,7 @@ static struct kwork_atom *atom_new(struct perf_kwork *kwork,
 	list_add_tail(&page->list, &kwork->atom_page_list);
 
 found_atom:
-	set_bit(i, page->bitmap);
+	__set_bit(i, page->bitmap);
 	atom->time = sample->time;
 	atom->prev = NULL;
 	atom->page_addr = page;
@@ -229,8 +236,8 @@ static void atom_free(struct kwork_atom *atom)
 	if (atom->prev != NULL)
 		atom_free(atom->prev);
 
-	clear_bit(atom->bit_inpage,
-		  ((struct kwork_atom_page *)atom->page_addr)->bitmap);
+	__clear_bit(atom->bit_inpage,
+		    ((struct kwork_atom_page *)atom->page_addr)->bitmap);
 }
 
 static void atom_del(struct kwork_atom *atom)
diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c
index 58e1ec1654ef..137d73edb541 100644
--- a/tools/perf/builtin-list.c
+++ b/tools/perf/builtin-list.c
@@ -15,34 +15,443 @@
 #include "util/pmu-hybrid.h"
 #include "util/debug.h"
 #include "util/metricgroup.h"
+#include "util/string2.h"
+#include "util/strlist.h"
+#include "util/strbuf.h"
 #include <subcmd/pager.h>
 #include <subcmd/parse-options.h>
+#include <linux/zalloc.h>
+#include <stdarg.h>
 #include <stdio.h>
 
-static bool desc_flag = true;
-static bool details_flag;
-static const char *hybrid_type;
+/**
+ * struct print_state - State and configuration passed to the default_print
+ * functions.
+ */
+struct print_state {
+	/**
+	 * @pmu_glob: Optionally restrict PMU and metric matching to PMU or
+	 * debugfs subsystem name.
+	 */
+	char *pmu_glob;
+	/** @event_glob: Optional pattern matching glob. */
+	char *event_glob;
+	/** @name_only: Print event or metric names only. */
+	bool name_only;
+	/** @desc: Print the event or metric description. */
+	bool desc;
+	/** @long_desc: Print longer event or metric description. */
+	bool long_desc;
+	/** @deprecated: Print deprecated events or metrics. */
+	bool deprecated;
+	/**
+	 * @detailed: Print extra information on the perf event such as names
+	 * and expressions used internally by events.
+	 */
+	bool detailed;
+	/** @metrics: Controls printing of metric and metric groups. */
+	bool metrics;
+	/** @metricgroups: Controls printing of metric and metric groups. */
+	bool metricgroups;
+	/** @last_topic: The last printed event topic. */
+	char *last_topic;
+	/** @last_metricgroups: The last printed metric group. */
+	char *last_metricgroups;
+	/** @visited_metrics: Metrics that are printed to avoid duplicates. */
+	struct strlist *visited_metrics;
+};
+
+static void default_print_start(void *ps)
+{
+	struct print_state *print_state = ps;
+
+	if (!print_state->name_only && pager_in_use())
+		printf("\nList of pre-defined events (to be used in -e or -M):\n\n");
+}
+
+static void default_print_end(void *print_state __maybe_unused) {}
+
+static void wordwrap(const char *s, int start, int max, int corr)
+{
+	int column = start;
+	int n;
+	bool saw_newline = false;
+
+	while (*s) {
+		int wlen = strcspn(s, " \t\n");
+
+		if ((column + wlen >= max && column > start) || saw_newline) {
+			printf("\n%*s", start, "");
+			column = start + corr;
+		}
+		n = printf("%s%.*s", column > start ? " " : "", wlen, s);
+		if (n <= 0)
+			break;
+		saw_newline = s[wlen] == '\n';
+		s += wlen;
+		column += n;
+		s = skip_spaces(s);
+	}
+}
+
+static void default_print_event(void *ps, const char *pmu_name, const char *topic,
+				const char *event_name, const char *event_alias,
+				const char *scale_unit __maybe_unused,
+				bool deprecated, const char *event_type_desc,
+				const char *desc, const char *long_desc,
+				const char *encoding_desc,
+				const char *metric_name, const char *metric_expr)
+{
+	struct print_state *print_state = ps;
+	int pos;
+
+	if (deprecated && !print_state->deprecated)
+		return;
+
+	if (print_state->pmu_glob && pmu_name && !strglobmatch(pmu_name, print_state->pmu_glob))
+		return;
+
+	if (print_state->event_glob &&
+	    (!event_name || !strglobmatch(event_name, print_state->event_glob)) &&
+	    (!event_alias || !strglobmatch(event_alias, print_state->event_glob)) &&
+	    (!topic || !strglobmatch_nocase(topic, print_state->event_glob)))
+		return;
+
+	if (print_state->name_only) {
+		if (event_alias && strlen(event_alias))
+			printf("%s ", event_alias);
+		else
+			printf("%s ", event_name);
+		return;
+	}
+
+	if (strcmp(print_state->last_topic, topic ?: "")) {
+		if (topic)
+			printf("\n%s:\n", topic);
+		free(print_state->last_topic);
+		print_state->last_topic = strdup(topic ?: "");
+	}
+
+	if (event_alias && strlen(event_alias))
+		pos = printf("  %s OR %s", event_name, event_alias);
+	else
+		pos = printf("  %s", event_name);
+
+	if (!topic && event_type_desc) {
+		for (; pos < 53; pos++)
+			putchar(' ');
+		printf("[%s]\n", event_type_desc);
+	} else
+		putchar('\n');
+
+	if (desc && print_state->desc) {
+		printf("%*s", 8, "[");
+		wordwrap(desc, 8, pager_get_columns(), 0);
+		printf("]\n");
+	}
+	long_desc = long_desc ?: desc;
+	if (long_desc && print_state->long_desc) {
+		printf("%*s", 8, "[");
+		wordwrap(long_desc, 8, pager_get_columns(), 0);
+		printf("]\n");
+	}
+
+	if (print_state->detailed && encoding_desc) {
+		printf("%*s", 8, "");
+		wordwrap(encoding_desc, 8, pager_get_columns(), 0);
+		if (metric_name)
+			printf(" MetricName: %s", metric_name);
+		if (metric_expr)
+			printf(" MetricExpr: %s", metric_expr);
+		putchar('\n');
+	}
+}
+
+static void default_print_metric(void *ps,
+				const char *group,
+				const char *name,
+				const char *desc,
+				const char *long_desc,
+				const char *expr,
+				const char *unit __maybe_unused)
+{
+	struct print_state *print_state = ps;
+
+	if (print_state->event_glob &&
+	    (!print_state->metrics || !name || !strglobmatch(name, print_state->event_glob)) &&
+	    (!print_state->metricgroups || !group || !strglobmatch(group, print_state->event_glob)))
+		return;
+
+	if (!print_state->name_only && !print_state->last_metricgroups) {
+		if (print_state->metricgroups) {
+			printf("\nMetric Groups:\n");
+			if (!print_state->metrics)
+				putchar('\n');
+		} else {
+			printf("\nMetrics:\n\n");
+		}
+	}
+	if (!print_state->last_metricgroups ||
+	    strcmp(print_state->last_metricgroups, group ?: "")) {
+		if (group && print_state->metricgroups) {
+			if (print_state->name_only)
+				printf("%s ", group);
+			else if (print_state->metrics)
+				printf("\n%s:\n", group);
+			else
+				printf("%s\n", group);
+		}
+		free(print_state->last_metricgroups);
+		print_state->last_metricgroups = strdup(group ?: "");
+	}
+	if (!print_state->metrics)
+		return;
+
+	if (print_state->name_only) {
+		if (print_state->metrics &&
+		    !strlist__has_entry(print_state->visited_metrics, name)) {
+			printf("%s ", name);
+			strlist__add(print_state->visited_metrics, name);
+		}
+		return;
+	}
+	printf("  %s\n", name);
+
+	if (desc && print_state->desc) {
+		printf("%*s", 8, "[");
+		wordwrap(desc, 8, pager_get_columns(), 0);
+		printf("]\n");
+	}
+	if (long_desc && print_state->long_desc) {
+		printf("%*s", 8, "[");
+		wordwrap(long_desc, 8, pager_get_columns(), 0);
+		printf("]\n");
+	}
+	if (expr && print_state->detailed) {
+		printf("%*s", 8, "[");
+		wordwrap(expr, 8, pager_get_columns(), 0);
+		printf("]\n");
+	}
+}
+
+struct json_print_state {
+	/** Should a separator be printed prior to the next item? */
+	bool need_sep;
+};
+
+static void json_print_start(void *print_state __maybe_unused)
+{
+	printf("[\n");
+}
+
+static void json_print_end(void *ps)
+{
+	struct json_print_state *print_state = ps;
+
+	printf("%s]\n", print_state->need_sep ? "\n" : "");
+}
+
+static void fix_escape_printf(struct strbuf *buf, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	strbuf_setlen(buf, 0);
+	for (size_t fmt_pos = 0; fmt_pos < strlen(fmt); fmt_pos++) {
+		switch (fmt[fmt_pos]) {
+		case '%':
+			fmt_pos++;
+			switch (fmt[fmt_pos]) {
+			case 's': {
+				const char *s = va_arg(args, const char*);
+
+				strbuf_addstr(buf, s);
+				break;
+			}
+			case 'S': {
+				const char *s = va_arg(args, const char*);
+
+				for (size_t s_pos = 0; s_pos < strlen(s); s_pos++) {
+					switch (s[s_pos]) {
+					case '\n':
+						strbuf_addstr(buf, "\\n");
+						break;
+					case '\\':
+						__fallthrough;
+					case '\"':
+						strbuf_addch(buf, '\\');
+						__fallthrough;
+					default:
+						strbuf_addch(buf, s[s_pos]);
+						break;
+					}
+				}
+				break;
+			}
+			default:
+				pr_err("Unexpected format character '%c'\n", fmt[fmt_pos]);
+				strbuf_addch(buf, '%');
+				strbuf_addch(buf, fmt[fmt_pos]);
+			}
+			break;
+		default:
+			strbuf_addch(buf, fmt[fmt_pos]);
+			break;
+		}
+	}
+	va_end(args);
+	fputs(buf->buf, stdout);
+}
+
+static void json_print_event(void *ps, const char *pmu_name, const char *topic,
+			     const char *event_name, const char *event_alias,
+			     const char *scale_unit,
+			     bool deprecated, const char *event_type_desc,
+			     const char *desc, const char *long_desc,
+			     const char *encoding_desc,
+			     const char *metric_name, const char *metric_expr)
+{
+	struct json_print_state *print_state = ps;
+	bool need_sep = false;
+	struct strbuf buf;
+
+	strbuf_init(&buf, 0);
+	printf("%s{\n", print_state->need_sep ? ",\n" : "");
+	print_state->need_sep = true;
+	if (pmu_name) {
+		fix_escape_printf(&buf, "\t\"Unit\": \"%S\"", pmu_name);
+		need_sep = true;
+	}
+	if (topic) {
+		fix_escape_printf(&buf, "%s\t\"Topic\": \"%S\"", need_sep ? ",\n" : "", topic);
+		need_sep = true;
+	}
+	if (event_name) {
+		fix_escape_printf(&buf, "%s\t\"EventName\": \"%S\"", need_sep ? ",\n" : "",
+				  event_name);
+		need_sep = true;
+	}
+	if (event_alias && strlen(event_alias)) {
+		fix_escape_printf(&buf, "%s\t\"EventAlias\": \"%S\"", need_sep ? ",\n" : "",
+				  event_alias);
+		need_sep = true;
+	}
+	if (scale_unit && strlen(scale_unit)) {
+		fix_escape_printf(&buf, "%s\t\"ScaleUnit\": \"%S\"", need_sep ? ",\n" : "",
+				  scale_unit);
+		need_sep = true;
+	}
+	if (event_type_desc) {
+		fix_escape_printf(&buf, "%s\t\"EventType\": \"%S\"", need_sep ? ",\n" : "",
+				  event_type_desc);
+		need_sep = true;
+	}
+	if (deprecated) {
+		fix_escape_printf(&buf, "%s\t\"Deprecated\": \"%S\"", need_sep ? ",\n" : "",
+				  deprecated ? "1" : "0");
+		need_sep = true;
+	}
+	if (desc) {
+		fix_escape_printf(&buf, "%s\t\"BriefDescription\": \"%S\"", need_sep ? ",\n" : "",
+				  desc);
+		need_sep = true;
+	}
+	if (long_desc) {
+		fix_escape_printf(&buf, "%s\t\"PublicDescription\": \"%S\"", need_sep ? ",\n" : "",
+				  long_desc);
+		need_sep = true;
+	}
+	if (encoding_desc) {
+		fix_escape_printf(&buf, "%s\t\"Encoding\": \"%S\"", need_sep ? ",\n" : "",
+				  encoding_desc);
+		need_sep = true;
+	}
+	if (metric_name) {
+		fix_escape_printf(&buf, "%s\t\"MetricName\": \"%S\"", need_sep ? ",\n" : "",
+				  metric_name);
+		need_sep = true;
+	}
+	if (metric_expr) {
+		fix_escape_printf(&buf, "%s\t\"MetricExpr\": \"%S\"", need_sep ? ",\n" : "",
+				  metric_expr);
+		need_sep = true;
+	}
+	printf("%s}", need_sep ? "\n" : "");
+	strbuf_release(&buf);
+}
+
+static void json_print_metric(void *ps __maybe_unused, const char *group,
+			      const char *name, const char *desc,
+			      const char *long_desc, const char *expr,
+			      const char *unit)
+{
+	struct json_print_state *print_state = ps;
+	bool need_sep = false;
+	struct strbuf buf;
+
+	strbuf_init(&buf, 0);
+	printf("%s{\n", print_state->need_sep ? ",\n" : "");
+	print_state->need_sep = true;
+	if (group) {
+		fix_escape_printf(&buf, "\t\"MetricGroup\": \"%S\"", group);
+		need_sep = true;
+	}
+	if (name) {
+		fix_escape_printf(&buf, "%s\t\"MetricName\": \"%S\"", need_sep ? ",\n" : "", name);
+		need_sep = true;
+	}
+	if (expr) {
+		fix_escape_printf(&buf, "%s\t\"MetricExpr\": \"%S\"", need_sep ? ",\n" : "", expr);
+		need_sep = true;
+	}
+	if (unit) {
+		fix_escape_printf(&buf, "%s\t\"ScaleUnit\": \"%S\"", need_sep ? ",\n" : "", unit);
+		need_sep = true;
+	}
+	if (desc) {
+		fix_escape_printf(&buf, "%s\t\"BriefDescription\": \"%S\"", need_sep ? ",\n" : "",
+				  desc);
+		need_sep = true;
+	}
+	if (long_desc) {
+		fix_escape_printf(&buf, "%s\t\"PublicDescription\": \"%S\"", need_sep ? ",\n" : "",
+				  long_desc);
+		need_sep = true;
+	}
+	printf("%s}", need_sep ? "\n" : "");
+	strbuf_release(&buf);
+}
 
 int cmd_list(int argc, const char **argv)
 {
 	int i, ret = 0;
-	bool raw_dump = false;
-	bool long_desc_flag = false;
-	bool deprecated = false;
-	char *pmu_name = NULL;
+	struct print_state default_ps = {};
+	struct print_state json_ps = {};
+	void *ps = &default_ps;
+	struct print_callbacks print_cb = {
+		.print_start = default_print_start,
+		.print_end = default_print_end,
+		.print_event = default_print_event,
+		.print_metric = default_print_metric,
+	};
+	const char *hybrid_name = NULL;
+	const char *unit_name = NULL;
+	bool json = false;
 	struct option list_options[] = {
-		OPT_BOOLEAN(0, "raw-dump", &raw_dump, "Dump raw events"),
-		OPT_BOOLEAN('d', "desc", &desc_flag,
+		OPT_BOOLEAN(0, "raw-dump", &default_ps.name_only, "Dump raw events"),
+		OPT_BOOLEAN('j', "json", &json, "JSON encode events and metrics"),
+		OPT_BOOLEAN('d', "desc", &default_ps.desc,
 			    "Print extra event descriptions. --no-desc to not print."),
-		OPT_BOOLEAN('v', "long-desc", &long_desc_flag,
+		OPT_BOOLEAN('v', "long-desc", &default_ps.long_desc,
 			    "Print longer event descriptions."),
-		OPT_BOOLEAN(0, "details", &details_flag,
+		OPT_BOOLEAN(0, "details", &default_ps.detailed,
 			    "Print information on the perf event names and expressions used internally by events."),
-		OPT_BOOLEAN(0, "deprecated", &deprecated,
+		OPT_BOOLEAN(0, "deprecated", &default_ps.deprecated,
 			    "Print deprecated events."),
-		OPT_STRING(0, "cputype", &hybrid_type, "hybrid cpu type",
-			   "Print events applying cpu with this type for hybrid platform "
-			   "(e.g. core or atom)"),
+		OPT_STRING(0, "cputype", &hybrid_name, "hybrid cpu type",
+			   "Limit PMU or metric printing to the given hybrid PMU (e.g. core or atom)."),
+		OPT_STRING(0, "unit", &unit_name, "PMU name",
+			   "Limit PMU or metric printing to the specified PMU."),
 		OPT_INCR(0, "debug", &verbose,
 			     "Enable debugging output"),
 		OPT_END()
@@ -53,24 +462,45 @@ int cmd_list(int argc, const char **argv)
 	};
 
 	set_option_flag(list_options, 0, "raw-dump", PARSE_OPT_HIDDEN);
+	/* Hide hybrid flag for the more generic 'unit' flag. */
+	set_option_flag(list_options, 0, "cputype", PARSE_OPT_HIDDEN);
 
 	argc = parse_options(argc, argv, list_options, list_usage,
 			     PARSE_OPT_STOP_AT_NON_OPTION);
 
 	setup_pager();
 
-	if (!raw_dump && pager_in_use())
-		printf("\nList of pre-defined events (to be used in -e or -M):\n\n");
+	if (!default_ps.name_only)
+		setup_pager();
 
-	if (hybrid_type) {
-		pmu_name = perf_pmu__hybrid_type_to_pmu(hybrid_type);
-		if (!pmu_name)
-			pr_warning("WARNING: hybrid cputype is not supported!\n");
+	if (json) {
+		print_cb = (struct print_callbacks){
+			.print_start = json_print_start,
+			.print_end = json_print_end,
+			.print_event = json_print_event,
+			.print_metric = json_print_metric,
+		};
+		ps = &json_ps;
+	} else {
+		default_ps.desc = !default_ps.long_desc;
+		default_ps.last_topic = strdup("");
+		assert(default_ps.last_topic);
+		default_ps.visited_metrics = strlist__new(NULL, NULL);
+		assert(default_ps.visited_metrics);
+		if (unit_name)
+			default_ps.pmu_glob = strdup(unit_name);
+		else if (hybrid_name) {
+			default_ps.pmu_glob = perf_pmu__hybrid_type_to_pmu(hybrid_name);
+			if (!default_ps.pmu_glob)
+				pr_warning("WARNING: hybrid cputype is not supported!\n");
+		}
 	}
+	print_cb.print_start(ps);
 
 	if (argc == 0) {
-		print_events(NULL, raw_dump, !desc_flag, long_desc_flag,
-				details_flag, deprecated, pmu_name);
+		default_ps.metrics = true;
+		default_ps.metricgroups = true;
+		print_events(&print_cb, ps);
 		goto out;
 	}
 
@@ -78,68 +508,75 @@ int cmd_list(int argc, const char **argv)
 		char *sep, *s;
 
 		if (strcmp(argv[i], "tracepoint") == 0)
-			print_tracepoint_events(NULL, NULL, raw_dump);
+			print_tracepoint_events(&print_cb, ps);
 		else if (strcmp(argv[i], "hw") == 0 ||
 			 strcmp(argv[i], "hardware") == 0)
-			print_symbol_events(NULL, PERF_TYPE_HARDWARE,
-					event_symbols_hw, PERF_COUNT_HW_MAX, raw_dump);
+			print_symbol_events(&print_cb, ps, PERF_TYPE_HARDWARE,
+					event_symbols_hw, PERF_COUNT_HW_MAX);
 		else if (strcmp(argv[i], "sw") == 0 ||
 			 strcmp(argv[i], "software") == 0) {
-			print_symbol_events(NULL, PERF_TYPE_SOFTWARE,
-					event_symbols_sw, PERF_COUNT_SW_MAX, raw_dump);
-			print_tool_events(NULL, raw_dump);
+			print_symbol_events(&print_cb, ps, PERF_TYPE_SOFTWARE,
+					event_symbols_sw, PERF_COUNT_SW_MAX);
+			print_tool_events(&print_cb, ps);
 		} else if (strcmp(argv[i], "cache") == 0 ||
 			 strcmp(argv[i], "hwcache") == 0)
-			print_hwcache_events(NULL, raw_dump);
+			print_hwcache_events(&print_cb, ps);
 		else if (strcmp(argv[i], "pmu") == 0)
-			print_pmu_events(NULL, raw_dump, !desc_flag,
-						long_desc_flag, details_flag,
-						deprecated, pmu_name);
+			print_pmu_events(&print_cb, ps);
 		else if (strcmp(argv[i], "sdt") == 0)
-			print_sdt_events(NULL, NULL, raw_dump);
-		else if (strcmp(argv[i], "metric") == 0 || strcmp(argv[i], "metrics") == 0)
-			metricgroup__print(true, false, NULL, raw_dump, details_flag, pmu_name);
-		else if (strcmp(argv[i], "metricgroup") == 0 || strcmp(argv[i], "metricgroups") == 0)
-			metricgroup__print(false, true, NULL, raw_dump, details_flag, pmu_name);
-		else if ((sep = strchr(argv[i], ':')) != NULL) {
-			int sep_idx;
-
-			sep_idx = sep - argv[i];
-			s = strdup(argv[i]);
-			if (s == NULL) {
+			print_sdt_events(&print_cb, ps);
+		else if (strcmp(argv[i], "metric") == 0 || strcmp(argv[i], "metrics") == 0) {
+			default_ps.metricgroups = false;
+			default_ps.metrics = true;
+			metricgroup__print(&print_cb, ps);
+		} else if (strcmp(argv[i], "metricgroup") == 0 ||
+			   strcmp(argv[i], "metricgroups") == 0) {
+			default_ps.metricgroups = true;
+			default_ps.metrics = false;
+			metricgroup__print(&print_cb, ps);
+		} else if ((sep = strchr(argv[i], ':')) != NULL) {
+			char *old_pmu_glob = default_ps.pmu_glob;
+
+			default_ps.event_glob = strdup(argv[i]);
+			if (!default_ps.event_glob) {
 				ret = -1;
 				goto out;
 			}
 
-			s[sep_idx] = '\0';
-			print_tracepoint_events(s, s + sep_idx + 1, raw_dump);
-			print_sdt_events(s, s + sep_idx + 1, raw_dump);
-			metricgroup__print(true, true, s, raw_dump, details_flag, pmu_name);
-			free(s);
+			print_tracepoint_events(&print_cb, ps);
+			print_sdt_events(&print_cb, ps);
+			default_ps.metrics = true;
+			default_ps.metricgroups = true;
+			metricgroup__print(&print_cb, ps);
+			zfree(&default_ps.event_glob);
+			default_ps.pmu_glob = old_pmu_glob;
 		} else {
 			if (asprintf(&s, "*%s*", argv[i]) < 0) {
 				printf("Critical: Not enough memory! Trying to continue...\n");
 				continue;
 			}
-			print_symbol_events(s, PERF_TYPE_HARDWARE,
-					    event_symbols_hw, PERF_COUNT_HW_MAX, raw_dump);
-			print_symbol_events(s, PERF_TYPE_SOFTWARE,
-					    event_symbols_sw, PERF_COUNT_SW_MAX, raw_dump);
-			print_tool_events(s, raw_dump);
-			print_hwcache_events(s, raw_dump);
-			print_pmu_events(s, raw_dump, !desc_flag,
-						long_desc_flag,
-						details_flag,
-						deprecated,
-						pmu_name);
-			print_tracepoint_events(NULL, s, raw_dump);
-			print_sdt_events(NULL, s, raw_dump);
-			metricgroup__print(true, true, s, raw_dump, details_flag, pmu_name);
+			default_ps.event_glob = s;
+			print_symbol_events(&print_cb, ps, PERF_TYPE_HARDWARE,
+					event_symbols_hw, PERF_COUNT_HW_MAX);
+			print_symbol_events(&print_cb, ps, PERF_TYPE_SOFTWARE,
+					event_symbols_sw, PERF_COUNT_SW_MAX);
+			print_tool_events(&print_cb, ps);
+			print_hwcache_events(&print_cb, ps);
+			print_pmu_events(&print_cb, ps);
+			print_tracepoint_events(&print_cb, ps);
+			print_sdt_events(&print_cb, ps);
+			default_ps.metrics = true;
+			default_ps.metricgroups = true;
+			metricgroup__print(&print_cb, ps);
 			free(s);
 		}
 	}
 
 out:
-	free(pmu_name);
+	print_cb.print_end(ps);
+	free(default_ps.pmu_glob);
+	free(default_ps.last_topic);
+	free(default_ps.last_metricgroups);
+	strlist__delete(default_ps.visited_metrics);
 	return ret;
 }
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 9722d4ab2e55..25c0a5e5051f 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -12,6 +12,7 @@
 #include "util/target.h"
 #include "util/callchain.h"
 #include "util/lock-contention.h"
+#include "util/bpf_skel/lock_data.h"
 
 #include <subcmd/pager.h>
 #include <subcmd/parse-options.h>
@@ -24,6 +25,7 @@
 #include "util/data.h"
 #include "util/string2.h"
 #include "util/map.h"
+#include "util/util.h"
 
 #include <sys/types.h>
 #include <sys/prctl.h>
@@ -54,22 +56,14 @@ static struct rb_root		thread_stats;
 
 static bool combine_locks;
 static bool show_thread_stats;
+static bool show_lock_addrs;
 static bool use_bpf;
 static unsigned long bpf_map_entries = 10240;
 static int max_stack_depth = CONTENTION_STACK_DEPTH;
 static int stack_skip = CONTENTION_STACK_SKIP;
 static int print_nr_entries = INT_MAX / 2;
 
-static enum {
-	LOCK_AGGR_ADDR,
-	LOCK_AGGR_TASK,
-	LOCK_AGGR_CALLER,
-} aggr_mode = LOCK_AGGR_ADDR;
-
-static u64 sched_text_start;
-static u64 sched_text_end;
-static u64 lock_text_start;
-static u64 lock_text_end;
+static enum lock_aggr_mode aggr_mode = LOCK_AGGR_ADDR;
 
 static struct thread_stat *thread_stat_find(u32 tid)
 {
@@ -853,55 +847,6 @@ end:
 	return 0;
 }
 
-bool is_lock_function(struct machine *machine, u64 addr)
-{
-	if (!sched_text_start) {
-		struct map *kmap;
-		struct symbol *sym;
-
-		sym = machine__find_kernel_symbol_by_name(machine,
-							  "__sched_text_start",
-							  &kmap);
-		if (!sym) {
-			/* to avoid retry */
-			sched_text_start = 1;
-			return false;
-		}
-
-		sched_text_start = kmap->unmap_ip(kmap, sym->start);
-
-		/* should not fail from here */
-		sym = machine__find_kernel_symbol_by_name(machine,
-							  "__sched_text_end",
-							  &kmap);
-		sched_text_end = kmap->unmap_ip(kmap, sym->start);
-
-		sym = machine__find_kernel_symbol_by_name(machine,
-							  "__lock_text_start",
-							  &kmap);
-		lock_text_start = kmap->unmap_ip(kmap, sym->start);
-
-		sym = machine__find_kernel_symbol_by_name(machine,
-							  "__lock_text_end",
-							  &kmap);
-		lock_text_end = kmap->unmap_ip(kmap, sym->start);
-	}
-
-	/* failed to get kernel symbols */
-	if (sched_text_start == 1)
-		return false;
-
-	/* mutex and rwsem functions are in sched text section */
-	if (sched_text_start <= addr && addr < sched_text_end)
-		return true;
-
-	/* spinlock functions are in lock text section */
-	if (lock_text_start <= addr && addr < lock_text_end)
-		return true;
-
-	return false;
-}
-
 static int get_symbol_name_offset(struct map *map, struct symbol *sym, u64 ip,
 				  char *buf, int size)
 {
@@ -960,7 +905,7 @@ static int lock_contention_caller(struct evsel *evsel, struct perf_sample *sampl
 			goto next;
 
 		sym = node->ms.sym;
-		if (sym && !is_lock_function(machine, node->ip)) {
+		if (sym && !machine__is_lock_function(machine, node->ip)) {
 			get_symbol_name_offset(node->ms.map, sym, node->ip,
 					       buf, size);
 			return 0;
@@ -1006,7 +951,7 @@ static u64 callchain_id(struct evsel *evsel, struct perf_sample *sample)
 		if (++skip <= stack_skip)
 			goto next;
 
-		if (node->ms.sym && is_lock_function(machine, node->ip))
+		if (node->ms.sym && machine__is_lock_function(machine, node->ip))
 			goto next;
 
 		hash ^= hash_long((unsigned long)node->ip, 64);
@@ -1055,13 +1000,32 @@ static int report_lock_contention_begin_event(struct evsel *evsel,
 	ls = lock_stat_find(key);
 	if (!ls) {
 		char buf[128];
-		const char *caller = buf;
+		const char *name = "";
 		unsigned int flags = evsel__intval(evsel, sample, "flags");
+		struct machine *machine = &session->machines.host;
+		struct map *kmap;
+		struct symbol *sym;
+
+		switch (aggr_mode) {
+		case LOCK_AGGR_ADDR:
+			/* make sure it loads the kernel map to find lock symbols */
+			map__load(machine__kernel_map(machine));
 
-		if (lock_contention_caller(evsel, sample, buf, sizeof(buf)) < 0)
-			caller = "Unknown";
+			sym = machine__find_kernel_symbol(machine, key, &kmap);
+			if (sym)
+				name = sym->name;
+			break;
+		case LOCK_AGGR_CALLER:
+			name = buf;
+			if (lock_contention_caller(evsel, sample, buf, sizeof(buf)) < 0)
+				name = "Unknown";
+			break;
+		case LOCK_AGGR_TASK:
+		default:
+			break;
+		}
 
-		ls = lock_stat_findnew(key, caller, flags);
+		ls = lock_stat_findnew(key, name, flags);
 		if (!ls)
 			return -ENOMEM;
 
@@ -1389,6 +1353,34 @@ static int dump_info(void)
 	return rc;
 }
 
+static const struct evsel_str_handler lock_tracepoints[] = {
+	{ "lock:lock_acquire",	 evsel__process_lock_acquire,   }, /* CONFIG_LOCKDEP */
+	{ "lock:lock_acquired",	 evsel__process_lock_acquired,  }, /* CONFIG_LOCKDEP, CONFIG_LOCK_STAT */
+	{ "lock:lock_contended", evsel__process_lock_contended, }, /* CONFIG_LOCKDEP, CONFIG_LOCK_STAT */
+	{ "lock:lock_release",	 evsel__process_lock_release,   }, /* CONFIG_LOCKDEP */
+};
+
+static const struct evsel_str_handler contention_tracepoints[] = {
+	{ "lock:contention_begin", evsel__process_contention_begin, },
+	{ "lock:contention_end",   evsel__process_contention_end,   },
+};
+
+static int process_event_update(struct perf_tool *tool,
+				union perf_event *event,
+				struct evlist **pevlist)
+{
+	int ret;
+
+	ret = perf_event__process_event_update(tool, event, pevlist);
+	if (ret < 0)
+		return ret;
+
+	/* this can return -EEXIST since we call it for each evsel */
+	perf_session__set_tracepoints_handlers(session, lock_tracepoints);
+	perf_session__set_tracepoints_handlers(session, contention_tracepoints);
+	return 0;
+}
+
 typedef int (*tracepoint_handler)(struct evsel *evsel,
 				  struct perf_sample *sample);
 
@@ -1488,10 +1480,19 @@ static void print_contention_result(struct lock_contention *con)
 		list_for_each_entry(key, &lock_keys, list)
 			pr_info("%*s ", key->len, key->header);
 
-		if (show_thread_stats)
+		switch (aggr_mode) {
+		case LOCK_AGGR_TASK:
 			pr_info("  %10s   %s\n\n", "pid", "comm");
-		else
+			break;
+		case LOCK_AGGR_CALLER:
 			pr_info("  %10s   %s\n\n", "type", "caller");
+			break;
+		case LOCK_AGGR_ADDR:
+			pr_info("  %16s   %s\n\n", "address", "symbol");
+			break;
+		default:
+			break;
+		}
 	}
 
 	bad = total = printed = 0;
@@ -1499,6 +1500,9 @@ static void print_contention_result(struct lock_contention *con)
 		bad = bad_hist[BROKEN_CONTENDED];
 
 	while ((st = pop_from_result())) {
+		struct thread *t;
+		int pid;
+
 		total += use_bpf ? st->nr_contended : 1;
 		if (st->broken)
 			bad++;
@@ -1508,18 +1512,24 @@ static void print_contention_result(struct lock_contention *con)
 			pr_info(" ");
 		}
 
-		if (show_thread_stats) {
-			struct thread *t;
-			int pid = st->addr;
-
-			/* st->addr contains tid of thread */
+		switch (aggr_mode) {
+		case LOCK_AGGR_CALLER:
+			pr_info("  %10s   %s\n", get_type_str(st), st->name);
+			break;
+		case LOCK_AGGR_TASK:
+			pid = st->addr;
 			t = perf_session__findnew(session, pid);
 			pr_info("  %10d   %s\n", pid, thread__comm_str(t));
-			goto next;
+			break;
+		case LOCK_AGGR_ADDR:
+			pr_info("  %016llx   %s\n", (unsigned long long)st->addr,
+				st->name ? : "");
+			break;
+		default:
+			break;
 		}
 
-		pr_info("  %10s   %s\n", get_type_str(st), st->name);
-		if (verbose) {
+		if (aggr_mode == LOCK_AGGR_CALLER && verbose) {
 			struct map *kmap;
 			struct symbol *sym;
 			char buf[128];
@@ -1536,7 +1546,6 @@ static void print_contention_result(struct lock_contention *con)
 			}
 		}
 
-next:
 		if (++printed >= print_nr_entries)
 			break;
 	}
@@ -1544,28 +1553,19 @@ next:
 	print_bad_events(bad, total);
 }
 
-static const struct evsel_str_handler lock_tracepoints[] = {
-	{ "lock:lock_acquire",	 evsel__process_lock_acquire,   }, /* CONFIG_LOCKDEP */
-	{ "lock:lock_acquired",	 evsel__process_lock_acquired,  }, /* CONFIG_LOCKDEP, CONFIG_LOCK_STAT */
-	{ "lock:lock_contended", evsel__process_lock_contended, }, /* CONFIG_LOCKDEP, CONFIG_LOCK_STAT */
-	{ "lock:lock_release",	 evsel__process_lock_release,   }, /* CONFIG_LOCKDEP */
-};
-
-static const struct evsel_str_handler contention_tracepoints[] = {
-	{ "lock:contention_begin", evsel__process_contention_begin, },
-	{ "lock:contention_end",   evsel__process_contention_end,   },
-};
-
 static bool force;
 
 static int __cmd_report(bool display_info)
 {
 	int err = -EINVAL;
 	struct perf_tool eops = {
+		.attr		 = perf_event__process_attr,
+		.event_update	 = process_event_update,
 		.sample		 = process_sample_event,
 		.comm		 = perf_event__process_comm,
 		.mmap		 = perf_event__process_mmap,
 		.namespaces	 = perf_event__process_namespaces,
+		.tracing_data	 = perf_event__process_tracing_data,
 		.ordered_events	 = true,
 	};
 	struct perf_data data = {
@@ -1584,17 +1584,19 @@ static int __cmd_report(bool display_info)
 	symbol_conf.sort_by_name = true;
 	symbol__init(&session->header.env);
 
-	if (!perf_session__has_traces(session, "lock record"))
-		goto out_delete;
+	if (!data.is_pipe) {
+		if (!perf_session__has_traces(session, "lock record"))
+			goto out_delete;
 
-	if (perf_session__set_tracepoints_handlers(session, lock_tracepoints)) {
-		pr_err("Initializing perf session tracepoint handlers failed\n");
-		goto out_delete;
-	}
+		if (perf_session__set_tracepoints_handlers(session, lock_tracepoints)) {
+			pr_err("Initializing perf session tracepoint handlers failed\n");
+			goto out_delete;
+		}
 
-	if (perf_session__set_tracepoints_handlers(session, contention_tracepoints)) {
-		pr_err("Initializing perf session tracepoint handlers failed\n");
-		goto out_delete;
+		if (perf_session__set_tracepoints_handlers(session, contention_tracepoints)) {
+			pr_err("Initializing perf session tracepoint handlers failed\n");
+			goto out_delete;
+		}
 	}
 
 	if (setup_output_field(false, output_fields))
@@ -1632,9 +1634,12 @@ static int __cmd_contention(int argc, const char **argv)
 {
 	int err = -EINVAL;
 	struct perf_tool eops = {
+		.attr		 = perf_event__process_attr,
+		.event_update	 = process_event_update,
 		.sample		 = process_sample_event,
 		.comm		 = perf_event__process_comm,
 		.mmap		 = perf_event__process_mmap,
+		.tracing_data	 = perf_event__process_tracing_data,
 		.ordered_events	 = true,
 	};
 	struct perf_data data = {
@@ -1658,6 +1663,9 @@ static int __cmd_contention(int argc, const char **argv)
 
 	con.machine = &session->machines.host;
 
+	con.aggr_mode = aggr_mode = show_thread_stats ? LOCK_AGGR_TASK :
+		show_lock_addrs ? LOCK_AGGR_ADDR : LOCK_AGGR_CALLER;
+
 	/* for lock function check */
 	symbol_conf.sort_by_name = true;
 	symbol__init(&session->header.env);
@@ -1697,7 +1705,7 @@ static int __cmd_contention(int argc, const char **argv)
 			pr_err("lock contention BPF setup failed\n");
 			goto out_delete;
 		}
-	} else {
+	} else if (!data.is_pipe) {
 		if (!perf_session__has_traces(session, "lock record"))
 			goto out_delete;
 
@@ -1720,11 +1728,6 @@ static int __cmd_contention(int argc, const char **argv)
 	if (select_key(true))
 		goto out_delete;
 
-	if (show_thread_stats)
-		aggr_mode = LOCK_AGGR_TASK;
-	else
-		aggr_mode = LOCK_AGGR_CALLER;
-
 	if (use_bpf) {
 		lock_contention_start();
 		if (argc)
@@ -1858,6 +1861,29 @@ static int parse_map_entry(const struct option *opt, const char *str,
 	return 0;
 }
 
+static int parse_max_stack(const struct option *opt, const char *str,
+			   int unset __maybe_unused)
+{
+	unsigned long *len = (unsigned long *)opt->value;
+	long val;
+	char *endptr;
+
+	errno = 0;
+	val = strtol(str, &endptr, 0);
+	if (*endptr != '\0' || errno != 0) {
+		pr_err("invalid max stack depth: %s\n", str);
+		return -1;
+	}
+
+	if (val < 0 || val > sysctl__max_stack()) {
+		pr_err("invalid max stack depth: %ld\n", val);
+		return -1;
+	}
+
+	*len = val;
+	return 0;
+}
+
 int cmd_lock(int argc, const char **argv)
 {
 	const struct option lock_options[] = {
@@ -1869,7 +1895,7 @@ int cmd_lock(int argc, const char **argv)
 		   "file", "vmlinux pathname"),
 	OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
 		   "file", "kallsyms pathname"),
-	OPT_BOOLEAN('q', "quiet", &quiet, "Do not show any message"),
+	OPT_BOOLEAN('q', "quiet", &quiet, "Do not show any warnings or messages"),
 	OPT_END()
 	};
 
@@ -1913,13 +1939,14 @@ int cmd_lock(int argc, const char **argv)
 		   "Trace on existing thread id (exclusive to --pid)"),
 	OPT_CALLBACK(0, "map-nr-entries", &bpf_map_entries, "num",
 		     "Max number of BPF map entries", parse_map_entry),
-	OPT_INTEGER(0, "max-stack", &max_stack_depth,
-		    "Set the maximum stack depth when collecting lock contention, "
-		    "Default: " __stringify(CONTENTION_STACK_DEPTH)),
+	OPT_CALLBACK(0, "max-stack", &max_stack_depth, "num",
+		     "Set the maximum stack depth when collecting lopck contention, "
+		     "Default: " __stringify(CONTENTION_STACK_DEPTH), parse_max_stack),
 	OPT_INTEGER(0, "stack-skip", &stack_skip,
 		    "Set the number of stack depth to skip when finding a lock caller, "
 		    "Default: " __stringify(CONTENTION_STACK_SKIP)),
 	OPT_INTEGER('E', "entries", &print_nr_entries, "display this many functions"),
+	OPT_BOOLEAN('l', "lock-addr", &show_lock_addrs, "show lock stats by address"),
 	OPT_PARENT(lock_options)
 	};
 
@@ -1989,6 +2016,16 @@ int cmd_lock(int argc, const char **argv)
 			argc = parse_options(argc, argv, contention_options,
 					     contention_usage, 0);
 		}
+
+		if (show_thread_stats && show_lock_addrs) {
+			pr_err("Cannot use thread and addr mode together\n");
+			parse_options_usage(contention_usage, contention_options,
+					    "threads", 0);
+			parse_options_usage(NULL, contention_options,
+					    "lock-addr", 0);
+			return -1;
+		}
+
 		rc = __cmd_contention(argc, argv);
 	} else {
 		usage_with_options(lock_usage, lock_options);
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index 923fb8316fda..dedd612eae5e 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -20,6 +20,7 @@
 #include "util/symbol.h"
 #include "util/pmu.h"
 #include "util/pmu-hybrid.h"
+#include "util/sample.h"
 #include "util/string2.h"
 #include <linux/err.h>
 
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index f62298f5db3b..2ae50fc9e597 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -40,7 +40,6 @@ static struct {
 	int command;	/* Command short_name */
 	bool list_events;
 	bool uprobes;
-	bool quiet;
 	bool target_used;
 	int nevents;
 	struct perf_probe_event events[MAX_PROBES];
@@ -514,8 +513,8 @@ __cmd_probe(int argc, const char **argv)
 	struct option options[] = {
 	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show parsed arguments, etc)"),
-	OPT_BOOLEAN('q', "quiet", &params.quiet,
-		    "be quiet (do not show any messages)"),
+	OPT_BOOLEAN('q', "quiet", &quiet,
+		    "be quiet (do not show any warnings or messages)"),
 	OPT_CALLBACK_DEFAULT('l', "list", NULL, "[GROUP:]EVENT",
 			     "list up probe events",
 			     opt_set_filter_with_command, DEFAULT_LIST_FILTER),
@@ -634,7 +633,7 @@ __cmd_probe(int argc, const char **argv)
 	if (ret)
 		return ret;
 
-	if (params.quiet) {
+	if (quiet) {
 		if (verbose != 0) {
 			pr_err("  Error: -v and -q are exclusive.\n");
 			return -EINVAL;
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index e128b855ddde..8ecffa696ce3 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -646,10 +646,10 @@ static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
 	return record__write(rec, map, bf, size);
 }
 
-static volatile int signr = -1;
-static volatile int child_finished;
+static volatile sig_atomic_t signr = -1;
+static volatile sig_atomic_t child_finished;
 #ifdef HAVE_EVENTFD_SUPPORT
-static volatile int done_fd = -1;
+static volatile sig_atomic_t done_fd = -1;
 #endif
 
 static void sig_handler(int sig)
@@ -1701,8 +1701,10 @@ static void record__init_features(struct record *rec)
 	if (rec->no_buildid)
 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
 
+#ifdef HAVE_LIBTRACEEVENT
 	if (!have_tracepoints(&rec->evlist->core.entries))
 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
+#endif
 
 	if (!rec->opts.branch_stack)
 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
@@ -1926,7 +1928,7 @@ static void record__read_lost_samples(struct record *rec)
 
 }
 
-static volatile int workload_exec_errno;
+static volatile sig_atomic_t workload_exec_errno;
 
 /*
  * evlist__prepare_workload will send a SIGUSR1
@@ -3378,8 +3380,6 @@ static struct option __record_options[] = {
 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
 		     record__mmap_flush_parse),
-	OPT_BOOLEAN(0, "group", &record.opts.group,
-		    "put the counters into a counter group"),
 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
 			   NULL, "enables call-graph recording" ,
 			   &record_callchain_opt),
@@ -3388,7 +3388,7 @@ static struct option __record_options[] = {
 		     &record_parse_callchain_opt),
 	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show counter open errors, etc)"),
-	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
+	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
 		    "per thread counts"),
 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
@@ -3555,7 +3555,7 @@ static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cp
 		/* Return ENODEV is input cpu is greater than max cpu */
 		if ((unsigned long)cpu.cpu > mask->nbits)
 			return -ENODEV;
-		set_bit(cpu.cpu, mask->bits);
+		__set_bit(cpu.cpu, mask->bits);
 	}
 
 	return 0;
@@ -3627,8 +3627,8 @@ static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map
 	pr_debug("nr_threads: %d\n", rec->nr_threads);
 
 	for (t = 0; t < rec->nr_threads; t++) {
-		set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
-		set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
+		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
+		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
 		if (verbose) {
 			pr_debug("thread_masks[%d]: ", t);
 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 8361890176c2..2ee2ecca208e 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -67,6 +67,10 @@
 #include <unistd.h>
 #include <linux/mman.h>
 
+#ifdef HAVE_LIBTRACEEVENT
+#include <traceevent/event-parse.h>
+#endif
+
 struct report {
 	struct perf_tool	tool;
 	struct perf_session	*session;
@@ -1199,7 +1203,9 @@ int cmd_report(int argc, const char **argv)
 			.lost		 = perf_event__process_lost,
 			.read		 = process_read_event,
 			.attr		 = process_attr,
+#ifdef HAVE_LIBTRACEEVENT
 			.tracing_data	 = perf_event__process_tracing_data,
+#endif
 			.build_id	 = perf_event__process_build_id,
 			.id_index	 = perf_event__process_id_index,
 			.auxtrace_info	 = perf_event__process_auxtrace_info,
@@ -1222,7 +1228,7 @@ int cmd_report(int argc, const char **argv)
 		    "input file name"),
 	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
-	OPT_BOOLEAN('q', "quiet", &quiet, "Do not show any message"),
+	OPT_BOOLEAN('q', "quiet", &quiet, "Do not show any warnings or messages"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
 	OPT_BOOLEAN(0, "stats", &report.stats_mode, "Display event stats"),
@@ -1660,6 +1666,7 @@ repeat:
 						  report.range_num);
 	}
 
+#ifdef HAVE_LIBTRACEEVENT
 	if (session->tevent.pevent &&
 	    tep_set_function_resolver(session->tevent.pevent,
 				      machine__resolve_kernel_addr,
@@ -1668,7 +1675,7 @@ repeat:
 		       __func__);
 		return -1;
 	}
-
+#endif
 	sort__setup_elide(stdout);
 
 	ret = __cmd_report(&report);
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index f93737eef07b..86e18575c9be 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1573,7 +1573,7 @@ static int map_switch_event(struct perf_sched *sched, struct evsel *evsel,
 
 	if (sched->map.comp) {
 		cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS);
-		if (!test_and_set_bit(this_cpu.cpu, sched->map.comp_cpus_mask)) {
+		if (!__test_and_set_bit(this_cpu.cpu, sched->map.comp_cpus_mask)) {
 			sched->map.comp_cpus[cpus_nr++] = this_cpu;
 			new_cpu = true;
 		}
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 7ca238277d83..88888fb885c8 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -62,6 +62,9 @@
 #include "perf.h"
 
 #include <linux/ctype.h>
+#ifdef HAVE_LIBTRACEEVENT
+#include <traceevent/event-parse.h>
+#endif
 
 static char const		*script_name;
 static char const		*generate_script_lang;
@@ -2049,7 +2052,7 @@ static void perf_sample__fprint_metric(struct perf_script *script,
 	u64 val;
 
 	if (!evsel->stats)
-		evlist__alloc_stats(script->session->evlist, false);
+		evlist__alloc_stats(&stat_config, script->session->evlist, /*alloc_raw=*/false);
 	if (evsel_script(leader)->gnum++ == 0)
 		perf_stat__reset_shadow_stats();
 	val = sample->period * evsel->scale;
@@ -2154,12 +2157,12 @@ static void process_event(struct perf_script *script,
 		perf_sample__fprintf_bts(sample, evsel, thread, al, addr_al, machine, fp);
 		return;
 	}
-
+#ifdef HAVE_LIBTRACEEVENT
 	if (PRINT_FIELD(TRACE) && sample->raw_data) {
 		event_format__fprintf(evsel->tp_format, sample->cpu,
 				      sample->raw_data, sample->raw_size, fp);
 	}
-
+#endif
 	if (attr->type == PERF_TYPE_SYNTH && PRINT_FIELD(SYNTH))
 		perf_sample__fprintf_synth(sample, evsel, fp);
 
@@ -2283,8 +2286,10 @@ static void process_stat_interval(u64 tstamp)
 
 static void setup_scripting(void)
 {
+#ifdef HAVE_LIBTRACEEVENT
 	setup_perl_scripting();
 	setup_python_scripting();
+#endif
 }
 
 static int flush_scripting(void)
@@ -3632,7 +3637,7 @@ static int set_maps(struct perf_script *script)
 
 	perf_evlist__set_maps(&evlist->core, script->cpus, script->threads);
 
-	if (evlist__alloc_stats(evlist, true))
+	if (evlist__alloc_stats(&stat_config, evlist, /*alloc_raw=*/true))
 		return -ENOMEM;
 
 	script->allocated = true;
@@ -3784,7 +3789,9 @@ int cmd_script(int argc, const char **argv)
 			.fork		 = perf_event__process_fork,
 			.attr		 = process_attr,
 			.event_update   = perf_event__process_event_update,
+#ifdef HAVE_LIBTRACEEVENT
 			.tracing_data	 = perf_event__process_tracing_data,
+#endif
 			.feature	 = process_feature_event,
 			.build_id	 = perf_event__process_build_id,
 			.id_index	 = perf_event__process_id_index,
@@ -4215,6 +4222,7 @@ script_found:
 	else
 		symbol_conf.use_callchain = false;
 
+#ifdef HAVE_LIBTRACEEVENT
 	if (session->tevent.pevent &&
 	    tep_set_function_resolver(session->tevent.pevent,
 				      machine__resolve_kernel_addr,
@@ -4223,7 +4231,7 @@ script_found:
 		err = -1;
 		goto out_delete;
 	}
-
+#endif
 	if (generate_script_lang) {
 		struct stat perf_stat;
 		int input;
@@ -4259,9 +4267,12 @@ script_found:
 			err = -ENOENT;
 			goto out_delete;
 		}
-
+#ifdef HAVE_LIBTRACEEVENT
 		err = scripting_ops->generate_script(session->tevent.pevent,
 						     "perf-script");
+#else
+		err = scripting_ops->generate_script(NULL, "perf-script");
+#endif
 		goto out_delete;
 	}
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 265b05157972..bf640abc9c41 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -93,6 +93,7 @@
 
 #include <linux/ctype.h>
 #include <perf/evlist.h>
+#include <internal/threadmap.h>
 
 #define DEFAULT_SEPARATOR	" "
 #define FREEZE_ON_SMI_PATH	"devices/cpu/freeze_on_smi"
@@ -173,14 +174,13 @@ static struct target target = {
 
 #define METRIC_ONLY_LEN 20
 
-static volatile pid_t		child_pid			= -1;
+static volatile sig_atomic_t	child_pid			= -1;
 static int			detailed_run			=  0;
 static bool			transaction_run;
 static bool			topdown_run			= false;
 static bool			smi_cost			= false;
 static bool			smi_reset			= false;
 static int			big_num_opt			=  -1;
-static bool			group				= false;
 static const char		*pre_cmd			= NULL;
 static const char		*post_cmd			= NULL;
 static bool			sync_run			= false;
@@ -208,7 +208,7 @@ struct perf_stat {
 static struct perf_stat		perf_stat;
 #define STAT_RECORD		perf_stat.record
 
-static volatile int done = 0;
+static volatile sig_atomic_t done = 0;
 
 static struct perf_stat_config stat_config = {
 	.aggr_mode		= AGGR_GLOBAL,
@@ -465,15 +465,19 @@ static int read_bpf_map_counters(void)
 	return 0;
 }
 
-static void read_counters(struct timespec *rs)
+static int read_counters(struct timespec *rs)
 {
-	struct evsel *counter;
-
 	if (!stat_config.stop_read_counter) {
 		if (read_bpf_map_counters() ||
 		    read_affinity_counters(rs))
-			return;
+			return -1;
 	}
+	return 0;
+}
+
+static void process_counters(void)
+{
+	struct evsel *counter;
 
 	evlist__for_each_entry(evsel_list, counter) {
 		if (counter->err)
@@ -482,6 +486,10 @@ static void read_counters(struct timespec *rs)
 			pr_warning("failed to process counter %s\n", counter->name);
 		counter->err = 0;
 	}
+
+	perf_stat_merge_counters(&stat_config, evsel_list);
+	perf_stat_process_percore(&stat_config, evsel_list);
+	perf_stat_process_shadow_stats(&stat_config, evsel_list);
 }
 
 static void process_interval(void)
@@ -492,7 +500,10 @@ static void process_interval(void)
 	diff_timespec(&rs, &ts, &ref_time);
 
 	perf_stat__reset_shadow_per_stat(&rt_stat);
-	read_counters(&rs);
+	evlist__reset_aggr_stats(evsel_list);
+
+	if (read_counters(&rs) == 0)
+		process_counters();
 
 	if (STAT_RECORD) {
 		if (WRITE_STAT_ROUND_EVENT(rs.tv_sec * NSEC_PER_SEC + rs.tv_nsec, INTERVAL))
@@ -528,26 +539,14 @@ static int enable_counters(void)
 			return err;
 	}
 
-	if (stat_config.initial_delay < 0) {
-		pr_info(EVLIST_DISABLED_MSG);
-		return 0;
-	}
-
-	if (stat_config.initial_delay > 0) {
-		pr_info(EVLIST_DISABLED_MSG);
-		usleep(stat_config.initial_delay * USEC_PER_MSEC);
-	}
-
 	/*
 	 * We need to enable counters only if:
 	 * - we don't have tracee (attaching to task or cpu)
 	 * - we have initial delay configured
 	 */
-	if (!target__none(&target) || stat_config.initial_delay) {
+	if (!target__none(&target)) {
 		if (!all_counters_use_bpf)
 			evlist__enable(evsel_list);
-		if (stat_config.initial_delay > 0)
-			pr_info(EVLIST_ENABLED_MSG);
 	}
 	return 0;
 }
@@ -569,7 +568,7 @@ static void disable_counters(void)
 	}
 }
 
-static volatile int workload_exec_errno;
+static volatile sig_atomic_t workload_exec_errno;
 
 /*
  * evlist__prepare_workload will send a SIGUSR1
@@ -769,9 +768,6 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 		child_pid = evsel_list->workload.pid;
 	}
 
-	if (group)
-		evlist__set_leader(evsel_list);
-
 	if (!cpu_map__is_dummy(evsel_list->core.user_requested_cpus)) {
 		if (affinity__setup(&saved_affinity) < 0)
 			return -1;
@@ -918,14 +914,27 @@ try_again_reset:
 			return err;
 	}
 
-	err = enable_counters();
-	if (err)
-		return -1;
+	if (stat_config.initial_delay) {
+		pr_info(EVLIST_DISABLED_MSG);
+	} else {
+		err = enable_counters();
+		if (err)
+			return -1;
+	}
 
 	/* Exec the command, if any */
 	if (forks)
 		evlist__start_workload(evsel_list);
 
+	if (stat_config.initial_delay > 0) {
+		usleep(stat_config.initial_delay * USEC_PER_MSEC);
+		err = enable_counters();
+		if (err)
+			return -1;
+
+		pr_info(EVLIST_ENABLED_MSG);
+	}
+
 	t0 = rdclock();
 	clock_gettime(CLOCK_MONOTONIC, &ref_time);
 
@@ -963,11 +972,9 @@ try_again_reset:
 		init_stats(&walltime_nsecs_stats);
 		update_stats(&walltime_nsecs_stats, t1 - t0);
 
-		if (stat_config.aggr_mode == AGGR_GLOBAL)
-			evlist__save_aggr_prev_raw_counts(evsel_list);
-
 		evlist__copy_prev_raw_counts(evsel_list);
 		evlist__reset_prev_raw_counts(evsel_list);
+		evlist__reset_aggr_stats(evsel_list);
 		perf_stat__reset_shadow_per_stat(&rt_stat);
 	} else {
 		update_stats(&walltime_nsecs_stats, t1 - t0);
@@ -980,7 +987,8 @@ try_again_reset:
 	 * avoid arbitrary skew, we must read all counters before closing any
 	 * group leaders.
 	 */
-	read_counters(&(struct timespec) { .tv_nsec = t1-t0 });
+	if (read_counters(&(struct timespec) { .tv_nsec = t1-t0 }) == 0)
+		process_counters();
 
 	/*
 	 * We need to keep evsel_list alive, because it's processed
@@ -1023,13 +1031,13 @@ static void print_counters(struct timespec *ts, int argc, const char **argv)
 	/* Do not print anything if we record to the pipe. */
 	if (STAT_RECORD && perf_stat.data.is_pipe)
 		return;
-	if (stat_config.quiet)
+	if (quiet)
 		return;
 
 	evlist__print_counters(evsel_list, &stat_config, &target, ts, argc, argv);
 }
 
-static volatile int signr = -1;
+static volatile sig_atomic_t signr = -1;
 
 static void skip_signal(int signo)
 {
@@ -1181,8 +1189,6 @@ static struct option stat_options[] = {
 #endif
 	OPT_BOOLEAN('a', "all-cpus", &target.system_wide,
 		    "system-wide collection from all CPUs"),
-	OPT_BOOLEAN('g', "group", &group,
-		    "put the counters into a counter group"),
 	OPT_BOOLEAN(0, "scale", &stat_config.scale,
 		    "Use --no-scale to disable counter scaling for multiplexing"),
 	OPT_INCR('v', "verbose", &verbose,
@@ -1273,8 +1279,8 @@ static struct option stat_options[] = {
 		       "print summary for interval mode"),
 	OPT_BOOLEAN(0, "no-csv-summary", &stat_config.no_csv_summary,
 		       "don't print 'summary' for CSV summary output"),
-	OPT_BOOLEAN(0, "quiet", &stat_config.quiet,
-			"don't print output (useful with record)"),
+	OPT_BOOLEAN(0, "quiet", &quiet,
+			"don't print any output, messages or warnings (useful with record)"),
 	OPT_CALLBACK(0, "cputype", &evsel_list, "hybrid cpu type",
 		     "Only enable events on applying cpu with this type "
 		     "for hybrid platform (e.g. core or atom)",
@@ -1330,10 +1336,26 @@ static struct aggr_cpu_id perf_stat__get_node(struct perf_stat_config *config __
 	return aggr_cpu_id__node(cpu, /*data=*/NULL);
 }
 
+static struct aggr_cpu_id perf_stat__get_global(struct perf_stat_config *config __maybe_unused,
+						struct perf_cpu cpu)
+{
+	return aggr_cpu_id__global(cpu, /*data=*/NULL);
+}
+
+static struct aggr_cpu_id perf_stat__get_cpu(struct perf_stat_config *config __maybe_unused,
+					     struct perf_cpu cpu)
+{
+	return aggr_cpu_id__cpu(cpu, /*data=*/NULL);
+}
+
 static struct aggr_cpu_id perf_stat__get_aggr(struct perf_stat_config *config,
 					      aggr_get_id_t get_id, struct perf_cpu cpu)
 {
-	struct aggr_cpu_id id = aggr_cpu_id__empty();
+	struct aggr_cpu_id id;
+
+	/* per-process mode - should use global aggr mode */
+	if (cpu.cpu == -1)
+		return get_id(config, cpu);
 
 	if (aggr_cpu_id__is_empty(&config->cpus_aggr_map->map[cpu.cpu]))
 		config->cpus_aggr_map->map[cpu.cpu] = get_id(config, cpu);
@@ -1366,16 +1388,16 @@ static struct aggr_cpu_id perf_stat__get_node_cached(struct perf_stat_config *co
 	return perf_stat__get_aggr(config, perf_stat__get_node, cpu);
 }
 
-static bool term_percore_set(void)
+static struct aggr_cpu_id perf_stat__get_global_cached(struct perf_stat_config *config,
+						       struct perf_cpu cpu)
 {
-	struct evsel *counter;
-
-	evlist__for_each_entry(evsel_list, counter) {
-		if (counter->percore)
-			return true;
-	}
+	return perf_stat__get_aggr(config, perf_stat__get_global, cpu);
+}
 
-	return false;
+static struct aggr_cpu_id perf_stat__get_cpu_cached(struct perf_stat_config *config,
+						    struct perf_cpu cpu)
+{
+	return perf_stat__get_aggr(config, perf_stat__get_cpu, cpu);
 }
 
 static aggr_cpu_id_get_t aggr_mode__get_aggr(enum aggr_mode aggr_mode)
@@ -1390,11 +1412,9 @@ static aggr_cpu_id_get_t aggr_mode__get_aggr(enum aggr_mode aggr_mode)
 	case AGGR_NODE:
 		return aggr_cpu_id__node;
 	case AGGR_NONE:
-		if (term_percore_set())
-			return aggr_cpu_id__core;
-
-		return NULL;
+		return aggr_cpu_id__cpu;
 	case AGGR_GLOBAL:
+		return aggr_cpu_id__global;
 	case AGGR_THREAD:
 	case AGGR_UNSET:
 	case AGGR_MAX:
@@ -1415,11 +1435,9 @@ static aggr_get_id_t aggr_mode__get_id(enum aggr_mode aggr_mode)
 	case AGGR_NODE:
 		return perf_stat__get_node_cached;
 	case AGGR_NONE:
-		if (term_percore_set()) {
-			return perf_stat__get_core_cached;
-		}
-		return NULL;
+		return perf_stat__get_cpu_cached;
 	case AGGR_GLOBAL:
+		return perf_stat__get_global_cached;
 	case AGGR_THREAD:
 	case AGGR_UNSET:
 	case AGGR_MAX:
@@ -1434,8 +1452,9 @@ static int perf_stat_init_aggr_mode(void)
 	aggr_cpu_id_get_t get_id = aggr_mode__get_aggr(stat_config.aggr_mode);
 
 	if (get_id) {
+		bool needs_sort = stat_config.aggr_mode != AGGR_NONE;
 		stat_config.aggr_map = cpu_aggr_map__new(evsel_list->core.user_requested_cpus,
-							 get_id, /*data=*/NULL);
+							 get_id, /*data=*/NULL, needs_sort);
 		if (!stat_config.aggr_map) {
 			pr_err("cannot build %s map", aggr_mode__string[stat_config.aggr_mode]);
 			return -1;
@@ -1443,6 +1462,21 @@ static int perf_stat_init_aggr_mode(void)
 		stat_config.aggr_get_id = aggr_mode__get_id(stat_config.aggr_mode);
 	}
 
+	if (stat_config.aggr_mode == AGGR_THREAD) {
+		nr = perf_thread_map__nr(evsel_list->core.threads);
+		stat_config.aggr_map = cpu_aggr_map__empty_new(nr);
+		if (stat_config.aggr_map == NULL)
+			return -ENOMEM;
+
+		for (int s = 0; s < nr; s++) {
+			struct aggr_cpu_id id = aggr_cpu_id__empty();
+
+			id.thread_idx = s;
+			stat_config.aggr_map->map[s] = id;
+		}
+		return 0;
+	}
+
 	/*
 	 * The evsel_list->cpus is the base we operate on,
 	 * taking the highest cpu number to be the size of
@@ -1527,6 +1561,26 @@ static struct aggr_cpu_id perf_env__get_core_aggr_by_cpu(struct perf_cpu cpu, vo
 	return id;
 }
 
+static struct aggr_cpu_id perf_env__get_cpu_aggr_by_cpu(struct perf_cpu cpu, void *data)
+{
+	struct perf_env *env = data;
+	struct aggr_cpu_id id = aggr_cpu_id__empty();
+
+	if (cpu.cpu != -1) {
+		/*
+		 * core_id is relative to socket and die,
+		 * we need a global id. So we set
+		 * socket, die id and core id
+		 */
+		id.socket = env->cpu[cpu.cpu].socket_id;
+		id.die = env->cpu[cpu.cpu].die_id;
+		id.core = env->cpu[cpu.cpu].core_id;
+		id.cpu = cpu;
+	}
+
+	return id;
+}
+
 static struct aggr_cpu_id perf_env__get_node_aggr_by_cpu(struct perf_cpu cpu, void *data)
 {
 	struct aggr_cpu_id id = aggr_cpu_id__empty();
@@ -1535,6 +1589,16 @@ static struct aggr_cpu_id perf_env__get_node_aggr_by_cpu(struct perf_cpu cpu, vo
 	return id;
 }
 
+static struct aggr_cpu_id perf_env__get_global_aggr_by_cpu(struct perf_cpu cpu __maybe_unused,
+							   void *data __maybe_unused)
+{
+	struct aggr_cpu_id id = aggr_cpu_id__empty();
+
+	/* it always aggregates to the cpu 0 */
+	id.cpu = (struct perf_cpu){ .cpu = 0 };
+	return id;
+}
+
 static struct aggr_cpu_id perf_stat__get_socket_file(struct perf_stat_config *config __maybe_unused,
 						     struct perf_cpu cpu)
 {
@@ -1552,12 +1616,24 @@ static struct aggr_cpu_id perf_stat__get_core_file(struct perf_stat_config *conf
 	return perf_env__get_core_aggr_by_cpu(cpu, &perf_stat.session->header.env);
 }
 
+static struct aggr_cpu_id perf_stat__get_cpu_file(struct perf_stat_config *config __maybe_unused,
+						  struct perf_cpu cpu)
+{
+	return perf_env__get_cpu_aggr_by_cpu(cpu, &perf_stat.session->header.env);
+}
+
 static struct aggr_cpu_id perf_stat__get_node_file(struct perf_stat_config *config __maybe_unused,
 						   struct perf_cpu cpu)
 {
 	return perf_env__get_node_aggr_by_cpu(cpu, &perf_stat.session->header.env);
 }
 
+static struct aggr_cpu_id perf_stat__get_global_file(struct perf_stat_config *config __maybe_unused,
+						     struct perf_cpu cpu)
+{
+	return perf_env__get_global_aggr_by_cpu(cpu, &perf_stat.session->header.env);
+}
+
 static aggr_cpu_id_get_t aggr_mode__get_aggr_file(enum aggr_mode aggr_mode)
 {
 	switch (aggr_mode) {
@@ -1569,8 +1645,10 @@ static aggr_cpu_id_get_t aggr_mode__get_aggr_file(enum aggr_mode aggr_mode)
 		return perf_env__get_core_aggr_by_cpu;
 	case AGGR_NODE:
 		return perf_env__get_node_aggr_by_cpu;
-	case AGGR_NONE:
 	case AGGR_GLOBAL:
+		return perf_env__get_global_aggr_by_cpu;
+	case AGGR_NONE:
+		return perf_env__get_cpu_aggr_by_cpu;
 	case AGGR_THREAD:
 	case AGGR_UNSET:
 	case AGGR_MAX:
@@ -1590,8 +1668,10 @@ static aggr_get_id_t aggr_mode__get_id_file(enum aggr_mode aggr_mode)
 		return perf_stat__get_core_file;
 	case AGGR_NODE:
 		return perf_stat__get_node_file;
-	case AGGR_NONE:
 	case AGGR_GLOBAL:
+		return perf_stat__get_global_file;
+	case AGGR_NONE:
+		return perf_stat__get_cpu_file;
 	case AGGR_THREAD:
 	case AGGR_UNSET:
 	case AGGR_MAX:
@@ -1604,11 +1684,29 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st)
 {
 	struct perf_env *env = &st->session->header.env;
 	aggr_cpu_id_get_t get_id = aggr_mode__get_aggr_file(stat_config.aggr_mode);
+	bool needs_sort = stat_config.aggr_mode != AGGR_NONE;
+
+	if (stat_config.aggr_mode == AGGR_THREAD) {
+		int nr = perf_thread_map__nr(evsel_list->core.threads);
+
+		stat_config.aggr_map = cpu_aggr_map__empty_new(nr);
+		if (stat_config.aggr_map == NULL)
+			return -ENOMEM;
+
+		for (int s = 0; s < nr; s++) {
+			struct aggr_cpu_id id = aggr_cpu_id__empty();
+
+			id.thread_idx = s;
+			stat_config.aggr_map->map[s] = id;
+		}
+		return 0;
+	}
 
 	if (!get_id)
 		return 0;
 
-	stat_config.aggr_map = cpu_aggr_map__new(evsel_list->core.user_requested_cpus, get_id, env);
+	stat_config.aggr_map = cpu_aggr_map__new(evsel_list->core.user_requested_cpus,
+						 get_id, env, needs_sort);
 	if (!stat_config.aggr_map) {
 		pr_err("cannot build %s map", aggr_mode__string[stat_config.aggr_mode]);
 		return -1;
@@ -1991,13 +2089,11 @@ static int process_stat_round_event(struct perf_session *session,
 				    union perf_event *event)
 {
 	struct perf_record_stat_round *stat_round = &event->stat_round;
-	struct evsel *counter;
 	struct timespec tsh, *ts = NULL;
 	const char **argv = session->header.env.cmdline_argv;
 	int argc = session->header.env.nr_cmdline;
 
-	evlist__for_each_entry(evsel_list, counter)
-		perf_stat_process_counter(&stat_config, counter);
+	process_counters();
 
 	if (stat_round->type == PERF_STAT_ROUND_TYPE__FINAL)
 		update_stats(&walltime_nsecs_stats, stat_round->time);
@@ -2024,17 +2120,23 @@ int process_stat_config_event(struct perf_session *session,
 	if (perf_cpu_map__empty(st->cpus)) {
 		if (st->aggr_mode != AGGR_UNSET)
 			pr_warning("warning: processing task data, aggregation mode not set\n");
-		return 0;
-	}
-
-	if (st->aggr_mode != AGGR_UNSET)
+	} else if (st->aggr_mode != AGGR_UNSET) {
 		stat_config.aggr_mode = st->aggr_mode;
+	}
 
 	if (perf_stat.data.is_pipe)
 		perf_stat_init_aggr_mode();
 	else
 		perf_stat_init_aggr_mode_file(st);
 
+	if (stat_config.aggr_map) {
+		int nr_aggr = stat_config.aggr_map->nr;
+
+		if (evlist__alloc_aggr_stats(session->evlist, nr_aggr) < 0) {
+			pr_err("cannot allocate aggr counts\n");
+			return -1;
+		}
+	}
 	return 0;
 }
 
@@ -2048,7 +2150,7 @@ static int set_maps(struct perf_stat *st)
 
 	perf_evlist__set_maps(&evsel_list->core, st->cpus, st->threads);
 
-	if (evlist__alloc_stats(evsel_list, true))
+	if (evlist__alloc_stats(&stat_config, evsel_list, /*alloc_raw=*/true))
 		return -ENOMEM;
 
 	st->maps_allocated = true;
@@ -2277,7 +2379,7 @@ int cmd_stat(int argc, const char **argv)
 		goto out;
 	}
 
-	if (!output && !stat_config.quiet) {
+	if (!output && !quiet) {
 		struct timespec tm;
 		mode = append_file ? "a" : "w";
 
@@ -2297,6 +2399,14 @@ int cmd_stat(int argc, const char **argv)
 		}
 	}
 
+	if (stat_config.interval_clear && !isatty(fileno(output))) {
+		fprintf(stderr, "--interval-clear does not work with output\n");
+		parse_options_usage(stat_usage, stat_options, "o", 1);
+		parse_options_usage(NULL, stat_options, "log-fd", 0);
+		parse_options_usage(NULL, stat_options, "interval-clear", 0);
+		return -1;
+	}
+
 	stat_config.output = output;
 
 	/*
@@ -2495,10 +2605,10 @@ int cmd_stat(int argc, const char **argv)
 		goto out;
 	}
 
-	if (evlist__alloc_stats(evsel_list, interval))
+	if (perf_stat_init_aggr_mode())
 		goto out;
 
-	if (perf_stat_init_aggr_mode())
+	if (evlist__alloc_stats(&stat_config, evsel_list, interval))
 		goto out;
 
 	/*
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index c36296bb7637..6c629e7d370a 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -38,6 +38,7 @@
 #include "util/string2.h"
 #include "util/tracepoint.h"
 #include <linux/err.h>
+#include <traceevent/event-parse.h>
 
 #ifdef LACKS_OPEN_MEMSTREAM_PROTOTYPE
 FILE *open_memstream(char **ptr, size_t *sizeloc);
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 4b3ff7687236..d4b5b02bab73 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -87,8 +87,8 @@
 #include <linux/ctype.h>
 #include <perf/mmap.h>
 
-static volatile int done;
-static volatile int resize;
+static volatile sig_atomic_t done;
+static volatile sig_atomic_t resize;
 
 #define HEADER_LINE_NR  5
 
@@ -1471,8 +1471,6 @@ int cmd_top(int argc, const char **argv)
 			    "dump the symbol table used for profiling"),
 	OPT_INTEGER('f', "count-filter", &top.count_filter,
 		    "only display functions with more events than this"),
-	OPT_BOOLEAN(0, "group", &opts->group,
-			    "put the counters into a counter group"),
 	OPT_BOOLEAN('i', "no-inherit", &opts->no_inherit,
 		    "child tasks do not inherit counters"),
 	OPT_STRING(0, "sym-annotate", &top.sym_filter, "symbol name",
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index d3c757769b96..86e06f136f40 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -15,7 +15,6 @@
  */
 
 #include "util/record.h"
-#include <traceevent/event-parse.h>
 #include <api/fs/tracing_path.h>
 #include <bpf/bpf.h>
 #include "util/bpf_map.h"
@@ -80,6 +79,10 @@
 #include <linux/ctype.h>
 #include <perf/mmap.h>
 
+#ifdef HAVE_LIBTRACEEVENT
+#include <traceevent/event-parse.h>
+#endif
+
 #ifndef O_CLOEXEC
 # define O_CLOEXEC		02000000
 #endif
@@ -88,6 +91,8 @@
 # define F_LINUX_SPECIFIC_BASE	1024
 #endif
 
+#define RAW_SYSCALL_ARGS_NUM	6
+
 /*
  * strtoul: Go from a string to a value, i.e. for msr: MSR_FS_BASE to 0xc0000100
  */
@@ -108,7 +113,7 @@ struct syscall_fmt {
 		const char *sys_enter,
 			   *sys_exit;
 	}	   bpf_prog_name;
-	struct syscall_arg_fmt arg[6];
+	struct syscall_arg_fmt arg[RAW_SYSCALL_ARGS_NUM];
 	u8	   nr_args;
 	bool	   errpid;
 	bool	   timeout;
@@ -120,7 +125,6 @@ struct trace {
 	struct syscalltbl	*sctbl;
 	struct {
 		struct syscall  *table;
-		struct bpf_map  *map;
 		struct { // per syscall BPF_MAP_TYPE_PROG_ARRAY
 			struct bpf_map  *sys_enter,
 					*sys_exit;
@@ -924,6 +928,8 @@ static struct syscall_fmt syscall_fmts[] = {
 	  .arg = { [0] = { .scnprintf = SCA_PTR, /* brk */ }, }, },
 	{ .name     = "clock_gettime",
 	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
+	{ .name	    = "clock_nanosleep",
+	  .arg = { [2] = { .scnprintf = SCA_TIMESPEC,  /* rqtp */ }, }, },
 	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
 	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
 		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
@@ -1053,7 +1059,8 @@ static struct syscall_fmt syscall_fmts[] = {
 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
 	{ .name	    = "perf_event_open",
-	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
+	  .arg = { [0] = { .scnprintf = SCA_PERF_ATTR,  /* attr */ },
+		   [2] = { .scnprintf = SCA_INT,	/* cpu */ },
 		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
 		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
 	{ .name	    = "pipe2",
@@ -1220,16 +1227,6 @@ struct syscall {
 };
 
 /*
- * Must match what is in the BPF program:
- *
- * tools/perf/examples/bpf/augmented_raw_syscalls.c
- */
-struct bpf_map_syscall_entry {
-	bool	enabled;
-	u16	string_args_len[6];
-};
-
-/*
  * We need to have this 'calculated' boolean because in some cases we really
  * don't know what is the duration of a syscall, for instance, when we start
  * a session and some threads are waiting for a syscall to finish, say 'poll',
@@ -1535,8 +1532,8 @@ static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
 }
 
 static pid_t workload_pid = -1;
-static bool done = false;
-static bool interrupted = false;
+static volatile sig_atomic_t done = false;
+static volatile sig_atomic_t interrupted = false;
 
 static void sighandler_interrupt(int sig __maybe_unused)
 {
@@ -1658,7 +1655,7 @@ static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
 {
 	int idx;
 
-	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
+	if (nr_args == RAW_SYSCALL_ARGS_NUM && sc->fmt && sc->fmt->nr_args != 0)
 		nr_args = sc->fmt->nr_args;
 
 	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
@@ -1730,7 +1727,7 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field
 			 len >= 2 && strcmp(field->name + len - 2, "fd") == 0) {
 			/*
 			 * /sys/kernel/tracing/events/syscalls/sys_enter*
-			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
+			 * grep -E 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
 			 * 65 int
 			 * 23 unsigned int
 			 * 7 unsigned long
@@ -1791,11 +1788,11 @@ static int trace__read_syscall_info(struct trace *trace, int id)
 #endif
 	sc = trace->syscalls.table + id;
 	if (sc->nonexistent)
-		return 0;
+		return -EEXIST;
 
 	if (name == NULL) {
 		sc->nonexistent = true;
-		return 0;
+		return -EEXIST;
 	}
 
 	sc->name = name;
@@ -1809,11 +1806,18 @@ static int trace__read_syscall_info(struct trace *trace, int id)
 		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
 	}
 
-	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
-		return -ENOMEM;
-
-	if (IS_ERR(sc->tp_format))
+	/*
+	 * Fails to read trace point format via sysfs node, so the trace point
+	 * doesn't exist.  Set the 'nonexistent' flag as true.
+	 */
+	if (IS_ERR(sc->tp_format)) {
+		sc->nonexistent = true;
 		return PTR_ERR(sc->tp_format);
+	}
+
+	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ?
+					RAW_SYSCALL_ARGS_NUM : sc->tp_format->format.nr_fields))
+		return -ENOMEM;
 
 	sc->args = sc->tp_format->format.fields;
 	/*
@@ -2131,11 +2135,8 @@ static struct syscall *trace__syscall_info(struct trace *trace,
 	    (err = trace__read_syscall_info(trace, id)) != 0)
 		goto out_cant_read;
 
-	if (trace->syscalls.table[id].name == NULL) {
-		if (trace->syscalls.table[id].nonexistent)
-			return NULL;
+	if (trace->syscalls.table && trace->syscalls.table[id].nonexistent)
 		goto out_cant_read;
-	}
 
 	return &trace->syscalls.table[id];
 
@@ -2728,8 +2729,10 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel,
 				offset = format_field__intval(field, sample, evsel->needs_swap);
 				syscall_arg.len = offset >> 16;
 				offset &= 0xffff;
+#ifdef HAVE_LIBTRACEEVENT_TEP_FIELD_IS_RELATIVE
 				if (field->flags & TEP_FIELD_IS_RELATIVE)
 					offset += field->offset + field->size;
+#endif
 			}
 
 			val = (uintptr_t)(sample->raw_data + offset);
@@ -3250,7 +3253,6 @@ static void trace__set_bpf_map_filtered_pids(struct trace *trace)
 
 static void trace__set_bpf_map_syscalls(struct trace *trace)
 {
-	trace->syscalls.map = trace__find_bpf_map_by_name(trace, "syscalls");
 	trace->syscalls.prog_array.sys_enter = trace__find_bpf_map_by_name(trace, "syscalls_sys_enter");
 	trace->syscalls.prog_array.sys_exit  = trace__find_bpf_map_by_name(trace, "syscalls_sys_exit");
 }
@@ -3330,80 +3332,6 @@ static int trace__bpf_prog_sys_exit_fd(struct trace *trace, int id)
 	return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(trace->syscalls.unaugmented_prog);
 }
 
-static void trace__init_bpf_map_syscall_args(struct trace *trace, int id, struct bpf_map_syscall_entry *entry)
-{
-	struct syscall *sc = trace__syscall_info(trace, NULL, id);
-	int arg = 0;
-
-	if (sc == NULL)
-		goto out;
-
-	for (; arg < sc->nr_args; ++arg) {
-		entry->string_args_len[arg] = 0;
-		if (sc->arg_fmt[arg].scnprintf == SCA_FILENAME) {
-			/* Should be set like strace -s strsize */
-			entry->string_args_len[arg] = PATH_MAX;
-		}
-	}
-out:
-	for (; arg < 6; ++arg)
-		entry->string_args_len[arg] = 0;
-}
-static int trace__set_ev_qualifier_bpf_filter(struct trace *trace)
-{
-	int fd = bpf_map__fd(trace->syscalls.map);
-	struct bpf_map_syscall_entry value = {
-		.enabled = !trace->not_ev_qualifier,
-	};
-	int err = 0;
-	size_t i;
-
-	for (i = 0; i < trace->ev_qualifier_ids.nr; ++i) {
-		int key = trace->ev_qualifier_ids.entries[i];
-
-		if (value.enabled) {
-			trace__init_bpf_map_syscall_args(trace, key, &value);
-			trace__init_syscall_bpf_progs(trace, key);
-		}
-
-		err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST);
-		if (err)
-			break;
-	}
-
-	return err;
-}
-
-static int __trace__init_syscalls_bpf_map(struct trace *trace, bool enabled)
-{
-	int fd = bpf_map__fd(trace->syscalls.map);
-	struct bpf_map_syscall_entry value = {
-		.enabled = enabled,
-	};
-	int err = 0, key;
-
-	for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
-		if (enabled)
-			trace__init_bpf_map_syscall_args(trace, key, &value);
-
-		err = bpf_map_update_elem(fd, &key, &value, BPF_ANY);
-		if (err)
-			break;
-	}
-
-	return err;
-}
-
-static int trace__init_syscalls_bpf_map(struct trace *trace)
-{
-	bool enabled = true;
-
-	if (trace->ev_qualifier_ids.nr)
-		enabled = trace->not_ev_qualifier;
-
-	return __trace__init_syscalls_bpf_map(trace, enabled);
-}
-
 static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, struct syscall *sc)
 {
 	struct tep_format_field *field, *candidate_field;
@@ -3618,16 +3546,6 @@ static void trace__set_bpf_map_syscalls(struct trace *trace __maybe_unused)
 {
 }
 
-static int trace__set_ev_qualifier_bpf_filter(struct trace *trace __maybe_unused)
-{
-	return 0;
-}
-
-static int trace__init_syscalls_bpf_map(struct trace *trace __maybe_unused)
-{
-	return 0;
-}
-
 static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace __maybe_unused,
 							    const char *name __maybe_unused)
 {
@@ -3661,8 +3579,6 @@ static bool trace__only_augmented_syscalls_evsels(struct trace *trace)
 
 static int trace__set_ev_qualifier_filter(struct trace *trace)
 {
-	if (trace->syscalls.map)
-		return trace__set_ev_qualifier_bpf_filter(trace);
 	if (trace->syscalls.events.sys_enter)
 		return trace__set_ev_qualifier_tp_filter(trace);
 	return 0;
@@ -4036,9 +3952,6 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 	if (err < 0)
 		goto out_error_mem;
 
-	if (trace->syscalls.map)
-		trace__init_syscalls_bpf_map(trace);
-
 	if (trace->syscalls.prog_array.sys_enter)
 		trace__init_syscalls_bpf_prog_array_maps(trace);
 
@@ -4092,8 +4005,8 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
 	}
 
 	trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
-				  evlist->core.threads->nr > 1 ||
-				  evlist__first(evlist)->core.attr.inherit;
+		perf_thread_map__nr(evlist->core.threads) > 1 ||
+		evlist__first(evlist)->core.attr.inherit;
 
 	/*
 	 * Now that we already used evsel->core.attr to ask the kernel to setup the
diff --git a/tools/perf/builtin-version.c b/tools/perf/builtin-version.c
index a71f491224da..a886929ec6e5 100644
--- a/tools/perf/builtin-version.c
+++ b/tools/perf/builtin-version.c
@@ -82,6 +82,7 @@ static void library_status(void)
 	STATUS(HAVE_AIO_SUPPORT, aio);
 	STATUS(HAVE_ZSTD_SUPPORT, zstd);
 	STATUS(HAVE_LIBPFM, libpfm4);
+	STATUS(HAVE_LIBTRACEEVENT, libtraceevent);
 }
 
 int cmd_version(int argc, const char **argv)
diff --git a/tools/perf/examples/bpf/5sec.c b/tools/perf/examples/bpf/5sec.c
index e6b6181c6dc6..3bd7fc17631f 100644
--- a/tools/perf/examples/bpf/5sec.c
+++ b/tools/perf/examples/bpf/5sec.c
@@ -39,13 +39,15 @@
    Copyright (C) 2018 Red Hat, Inc., Arnaldo Carvalho de Melo <acme@redhat.com>
 */
 
-#include <bpf.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
 
 #define NSEC_PER_SEC	1000000000L
 
-int probe(hrtimer_nanosleep, rqtp)(void *ctx, int err, long long sec)
+SEC("hrtimer_nanosleep=hrtimer_nanosleep rqtp")
+int hrtimer_nanosleep(void *ctx, int err, long long sec)
 {
 	return sec / NSEC_PER_SEC == 5ULL;
 }
 
-license(GPL);
+char _license[] SEC("license") = "GPL";
diff --git a/tools/perf/examples/bpf/augmented_raw_syscalls.c b/tools/perf/examples/bpf/augmented_raw_syscalls.c
index a262dcd020f4..9a03189d33d3 100644
--- a/tools/perf/examples/bpf/augmented_raw_syscalls.c
+++ b/tools/perf/examples/bpf/augmented_raw_syscalls.c
@@ -14,39 +14,52 @@
  * code that will combine entry/exit in a strace like way.
  */
 
-#include <unistd.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
 #include <linux/limits.h>
-#include <linux/socket.h>
-#include <pid_filter.h>
 
-/* bpf-output associated map */
-bpf_map(__augmented_syscalls__, PERF_EVENT_ARRAY, int, u32, __NR_CPUS__);
+// FIXME: These should come from system headers
+typedef char bool;
+typedef int pid_t;
+typedef long long int __s64;
+typedef __s64 time64_t;
 
-/*
- * string_args_len: one per syscall arg, 0 means not a string or don't copy it,
- * 		    PATH_MAX for copying everything, any other value to limit
- * 		    it a la 'strace -s strsize'.
- */
-struct syscall {
-	bool	enabled;
-	u16	string_args_len[6];
+struct timespec64 {
+	time64_t	tv_sec;
+	long int	tv_nsec;
 };
 
-bpf_map(syscalls, ARRAY, int, struct syscall, 512);
+/* bpf-output associated map */
+struct __augmented_syscalls__ {
+	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+	__type(key, int);
+	__type(value, __u32);
+	__uint(max_entries, __NR_CPUS__);
+} __augmented_syscalls__ SEC(".maps");
 
 /*
  * What to augment at entry?
  *
  * Pointer arg payloads (filenames, etc) passed from userspace to the kernel
  */
-bpf_map(syscalls_sys_enter, PROG_ARRAY, u32, u32, 512);
+struct syscalls_sys_enter {
+	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+	__type(key, __u32);
+	__type(value, __u32);
+	__uint(max_entries, 512);
+} syscalls_sys_enter SEC(".maps");
 
 /*
  * What to augment at exit?
  *
  * Pointer arg payloads returned from the kernel (struct stat, etc) to userspace.
  */
-bpf_map(syscalls_sys_exit, PROG_ARRAY, u32, u32, 512);
+struct syscalls_sys_exit {
+	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+	__type(key, __u32);
+	__type(value, __u32);
+	__uint(max_entries, 512);
+} syscalls_sys_exit SEC(".maps");
 
 struct syscall_enter_args {
 	unsigned long long common_tp_fields;
@@ -66,7 +79,38 @@ struct augmented_arg {
 	char		value[PATH_MAX];
 };
 
-pid_filter(pids_filtered);
+struct pids_filtered {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, pid_t);
+	__type(value, bool);
+	__uint(max_entries, 64);
+} pids_filtered SEC(".maps");
+
+/*
+ * Desired design of maximum size and alignment (see RFC2553)
+ */
+#define SS_MAXSIZE   128     /* Implementation specific max size */
+
+typedef unsigned short sa_family_t;
+
+/*
+ * FIXME: Should come from system headers
+ *
+ * The definition uses anonymous union and struct in order to control the
+ * default alignment.
+ */
+struct sockaddr_storage {
+	union {
+		struct {
+			sa_family_t    ss_family; /* address family */
+			/* Following field(s) are implementation specific */
+			char __data[SS_MAXSIZE - sizeof(unsigned short)];
+				/* space to achieve desired size, */
+				/* _SS_MAXSIZE value minus size of ss_family */
+		};
+		void *__align; /* implementation specific desired alignment */
+	};
+};
 
 struct augmented_args_payload {
        struct syscall_enter_args args;
@@ -75,11 +119,17 @@ struct augmented_args_payload {
 			struct augmented_arg arg, arg2;
 		};
 		struct sockaddr_storage saddr;
+		char   __data[sizeof(struct augmented_arg)];
 	};
 };
 
 // We need more tmp space than the BPF stack can give us
-bpf_map(augmented_args_tmp, PERCPU_ARRAY, int, struct augmented_args_payload, 1);
+struct augmented_args_tmp {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__type(key, int);
+	__type(value, struct augmented_args_payload);
+	__uint(max_entries, 1);
+} augmented_args_tmp SEC(".maps");
 
 static inline struct augmented_args_payload *augmented_args_payload(void)
 {
@@ -90,14 +140,14 @@ static inline struct augmented_args_payload *augmented_args_payload(void)
 static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len)
 {
 	/* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */
-	return perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len);
+	return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len);
 }
 
 static inline
 unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len)
 {
 	unsigned int augmented_len = sizeof(*augmented_arg);
-	int string_len = probe_read_str(&augmented_arg->value, arg_len, arg);
+	int string_len = bpf_probe_read_str(&augmented_arg->value, arg_len, arg);
 
 	augmented_arg->size = augmented_arg->err = 0;
 	/*
@@ -146,7 +196,7 @@ int sys_enter_connect(struct syscall_enter_args *args)
 	if (socklen > sizeof(augmented_args->saddr))
 		socklen = sizeof(augmented_args->saddr);
 
-	probe_read(&augmented_args->saddr, socklen, sockaddr_arg);
+	bpf_probe_read(&augmented_args->saddr, socklen, sockaddr_arg);
 
 	return augmented__output(args, augmented_args, len + socklen);
 }
@@ -165,7 +215,7 @@ int sys_enter_sendto(struct syscall_enter_args *args)
 	if (socklen > sizeof(augmented_args->saddr))
 		socklen = sizeof(augmented_args->saddr);
 
-	probe_read(&augmented_args->saddr, socklen, sockaddr_arg);
+	bpf_probe_read(&augmented_args->saddr, socklen, sockaddr_arg);
 
 	return augmented__output(args, augmented_args, len + socklen);
 }
@@ -234,6 +284,80 @@ int sys_enter_renameat(struct syscall_enter_args *args)
 	return augmented__output(args, augmented_args, len);
 }
 
+#define PERF_ATTR_SIZE_VER0     64      /* sizeof first published struct */
+
+// we need just the start, get the size to then copy it
+struct perf_event_attr_size {
+        __u32                   type;
+        /*
+         * Size of the attr structure, for fwd/bwd compat.
+         */
+        __u32                   size;
+};
+
+SEC("!syscalls:sys_enter_perf_event_open")
+int sys_enter_perf_event_open(struct syscall_enter_args *args)
+{
+	struct augmented_args_payload *augmented_args = augmented_args_payload();
+	const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read;
+	unsigned int len = sizeof(augmented_args->args);
+
+        if (augmented_args == NULL)
+		goto failure;
+
+	if (bpf_probe_read(&augmented_args->__data, sizeof(*attr), attr) < 0)
+		goto failure;
+
+	attr_read = (const struct perf_event_attr_size *)augmented_args->__data;
+
+	__u32 size = attr_read->size;
+
+	if (!size)
+		size = PERF_ATTR_SIZE_VER0;
+
+	if (size > sizeof(augmented_args->__data))
+                goto failure;
+
+	// Now that we read attr->size and tested it against the size limits, read it completely
+	if (bpf_probe_read(&augmented_args->__data, size, attr) < 0)
+		goto failure;
+
+	return augmented__output(args, augmented_args, len + size);
+failure:
+	return 1; /* Failure: don't filter */
+}
+
+SEC("!syscalls:sys_enter_clock_nanosleep")
+int sys_enter_clock_nanosleep(struct syscall_enter_args *args)
+{
+	struct augmented_args_payload *augmented_args = augmented_args_payload();
+	const void *rqtp_arg = (const void *)args->args[2];
+	unsigned int len = sizeof(augmented_args->args);
+	__u32 size = sizeof(struct timespec64);
+
+        if (augmented_args == NULL)
+		goto failure;
+
+	if (size > sizeof(augmented_args->__data))
+                goto failure;
+
+	bpf_probe_read(&augmented_args->__data, size, rqtp_arg);
+
+	return augmented__output(args, augmented_args, len + size);
+failure:
+	return 1; /* Failure: don't filter */
+}
+
+static pid_t getpid(void)
+{
+	return bpf_get_current_pid_tgid();
+}
+
+static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
+{
+	return bpf_map_lookup_elem(pids, &pid) != NULL;
+}
+
 SEC("raw_syscalls:sys_enter")
 int sys_enter(struct syscall_enter_args *args)
 {
@@ -248,7 +372,6 @@ int sys_enter(struct syscall_enter_args *args)
 	 * initial, non-augmented raw_syscalls:sys_enter payload.
 	 */
 	unsigned int len = sizeof(augmented_args->args);
-	struct syscall *syscall;
 
 	if (pid_filter__has(&pids_filtered, getpid()))
 		return 0;
@@ -257,7 +380,7 @@ int sys_enter(struct syscall_enter_args *args)
 	if (augmented_args == NULL)
 		return 1;
 
-	probe_read(&augmented_args->args, sizeof(augmented_args->args), args);
+	bpf_probe_read(&augmented_args->args, sizeof(augmented_args->args), args);
 
 	/*
 	 * Jump to syscall specific augmenter, even if the default one,
@@ -278,7 +401,7 @@ int sys_exit(struct syscall_exit_args *args)
 	if (pid_filter__has(&pids_filtered, getpid()))
 		return 0;
 
-	probe_read(&exit_args, sizeof(exit_args), args);
+	bpf_probe_read(&exit_args, sizeof(exit_args), args);
 	/*
 	 * Jump to syscall specific return augmenter, even if the default one,
 	 * "!raw_syscalls:unaugmented" that will just return 1 to return the
@@ -291,4 +414,4 @@ int sys_exit(struct syscall_exit_args *args)
 	return 0;
 }
 
-license(GPL);
+char _license[] SEC("license") = "GPL";
diff --git a/tools/perf/examples/bpf/augmented_syscalls.c b/tools/perf/examples/bpf/augmented_syscalls.c
deleted file mode 100644
index 524fdb8534b3..000000000000
--- a/tools/perf/examples/bpf/augmented_syscalls.c
+++ /dev/null
@@ -1,169 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Augment syscalls with the contents of the pointer arguments.
- *
- * Test it with:
- *
- * perf trace -e tools/perf/examples/bpf/augmented_syscalls.c cat /etc/passwd > /dev/null
- *
- * It'll catch some openat syscalls related to the dynamic linked and
- * the last one should be the one for '/etc/passwd'.
- *
- * This matches what is marshalled into the raw_syscall:sys_enter payload
- * expected by the 'perf trace' beautifiers, and can be used by them, that will
- * check if perf_sample->raw_data is more than what is expected for each
- * syscalls:sys_{enter,exit}_SYSCALL tracepoint, uing the extra data as the
- * contents of pointer arguments.
- */
-
-#include <stdio.h>
-#include <linux/socket.h>
-
-/* bpf-output associated map */
-bpf_map(__augmented_syscalls__, PERF_EVENT_ARRAY, int, u32, __NR_CPUS__);
-
-struct syscall_exit_args {
-	unsigned long long common_tp_fields;
-	long		   syscall_nr;
-	long		   ret;
-};
-
-struct augmented_filename {
-	unsigned int	size;
-	int		reserved;
-	char		value[256];
-};
-
-#define augmented_filename_syscall(syscall)							\
-struct augmented_enter_##syscall##_args {			 				\
-	struct syscall_enter_##syscall##_args	args;				 		\
-	struct augmented_filename		filename;				 	\
-};												\
-int syscall_enter(syscall)(struct syscall_enter_##syscall##_args *args)				\
-{												\
-	struct augmented_enter_##syscall##_args augmented_args = { .filename.reserved = 0, }; 	\
-	unsigned int len = sizeof(augmented_args);						\
-	probe_read(&augmented_args.args, sizeof(augmented_args.args), args);			\
-	augmented_args.filename.size = probe_read_str(&augmented_args.filename.value, 		\
-						      sizeof(augmented_args.filename.value), 	\
-						      args->filename_ptr); 			\
-	if (augmented_args.filename.size < sizeof(augmented_args.filename.value)) {		\
-		len -= sizeof(augmented_args.filename.value) - augmented_args.filename.size;	\
-		len &= sizeof(augmented_args.filename.value) - 1;				\
-	}											\
-	/* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */	\
-	return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, 		\
-				 &augmented_args, len);						\
-}												\
-int syscall_exit(syscall)(struct syscall_exit_args *args)					\
-{												\
-       return 1; /* 0 as soon as we start copying data returned by the kernel, e.g. 'read' */	\
-}
-
-struct syscall_enter_openat_args {
-	unsigned long long common_tp_fields;
-	long		   syscall_nr;
-	long		   dfd;
-	char		   *filename_ptr;
-	long		   flags;
-	long		   mode;
-};
-
-augmented_filename_syscall(openat);
-
-struct syscall_enter_open_args {
-	unsigned long long common_tp_fields;
-	long		   syscall_nr;
-	char		   *filename_ptr;
-	long		   flags;
-	long		   mode;
-};
-
-augmented_filename_syscall(open);
-
-struct syscall_enter_inotify_add_watch_args {
-	unsigned long long common_tp_fields;
-	long		   syscall_nr;
-	long		   fd;
-	char		   *filename_ptr;
-	long		   mask;
-};
-
-augmented_filename_syscall(inotify_add_watch);
-
-struct statbuf;
-
-struct syscall_enter_newstat_args {
-	unsigned long long common_tp_fields;
-	long		   syscall_nr;
-	char		   *filename_ptr;
-	struct stat	   *statbuf;
-};
-
-augmented_filename_syscall(newstat);
-
-#ifndef _K_SS_MAXSIZE
-#define _K_SS_MAXSIZE 128
-#endif
-
-#define augmented_sockaddr_syscall(syscall)						\
-struct augmented_enter_##syscall##_args {			 				\
-	struct syscall_enter_##syscall##_args	args;				 		\
-	struct sockaddr_storage			addr;						\
-};												\
-int syscall_enter(syscall)(struct syscall_enter_##syscall##_args *args)				\
-{												\
-	struct augmented_enter_##syscall##_args augmented_args;				 	\
-	unsigned long addrlen = sizeof(augmented_args.addr);					\
-	probe_read(&augmented_args.args, sizeof(augmented_args.args), args);			\
-/* FIXME_CLANG_OPTIMIZATION_THAT_ACCESSES_USER_CONTROLLED_ADDRLEN_DESPITE_THIS_CHECK */		\
-/*	if (addrlen > augmented_args.args.addrlen)				     */		\
-/*		addrlen = augmented_args.args.addrlen;				     */		\
-/*										     */		\
-	probe_read(&augmented_args.addr, addrlen, args->addr_ptr); 				\
-	/* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */	\
-	return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, 		\
-				 &augmented_args, 						\
-				sizeof(augmented_args) - sizeof(augmented_args.addr) + addrlen);\
-}												\
-int syscall_exit(syscall)(struct syscall_exit_args *args)					\
-{												\
-       return 1; /* 0 as soon as we start copying data returned by the kernel, e.g. 'read' */	\
-}
-
-struct sockaddr;
-
-struct syscall_enter_bind_args {
-	unsigned long long common_tp_fields;
-	long		   syscall_nr;
-	long		   fd;
-	struct sockaddr	   *addr_ptr;
-	unsigned long	   addrlen;
-};
-
-augmented_sockaddr_syscall(bind);
-
-struct syscall_enter_connect_args {
-	unsigned long long common_tp_fields;
-	long		   syscall_nr;
-	long		   fd;
-	struct sockaddr	   *addr_ptr;
-	unsigned long	   addrlen;
-};
-
-augmented_sockaddr_syscall(connect);
-
-struct syscall_enter_sendto_args {
-	unsigned long long common_tp_fields;
-	long		   syscall_nr;
-	long		   fd;
-	void		   *buff;
-	long		   len;
-	unsigned long	   flags;
-	struct sockaddr	   *addr_ptr;
-	long		   addr_len;
-};
-
-augmented_sockaddr_syscall(sendto);
-
-license(GPL);
diff --git a/tools/perf/examples/bpf/empty.c b/tools/perf/examples/bpf/empty.c
index 7d7fb0c9fe76..3e296c0c53d7 100644
--- a/tools/perf/examples/bpf/empty.c
+++ b/tools/perf/examples/bpf/empty.c
@@ -1,3 +1,12 @@
-#include <bpf/bpf.h>
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
 
-license(GPL);
+struct syscall_enter_args;
+
+SEC("raw_syscalls:sys_enter")
+int sys_enter(struct syscall_enter_args *args)
+{
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/perf/examples/bpf/etcsnoop.c b/tools/perf/examples/bpf/etcsnoop.c
deleted file mode 100644
index e81b535346c0..000000000000
--- a/tools/perf/examples/bpf/etcsnoop.c
+++ /dev/null
@@ -1,76 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Augment the filename syscalls with the contents of the filename pointer argument
- * filtering only those that do not start with /etc/.
- *
- * Test it with:
- *
- * perf trace -e tools/perf/examples/bpf/augmented_syscalls.c cat /etc/passwd > /dev/null
- *
- * It'll catch some openat syscalls related to the dynamic linked and
- * the last one should be the one for '/etc/passwd'.
- *
- * This matches what is marshalled into the raw_syscall:sys_enter payload
- * expected by the 'perf trace' beautifiers, and can be used by them unmodified,
- * which will be done as that feature is implemented in the next csets, for now
- * it will appear in a dump done by the default tracepoint handler in 'perf trace',
- * that uses bpf_output__fprintf() to just dump those contents, as done with
- * the bpf-output event associated with the __bpf_output__ map declared in
- * tools/perf/include/bpf/stdio.h.
- */
-
-#include <stdio.h>
-
-/* bpf-output associated map */
-bpf_map(__augmented_syscalls__, PERF_EVENT_ARRAY, int, u32, __NR_CPUS__);
-
-struct augmented_filename {
-	int	size;
-	int	reserved;
-	char	value[64];
-};
-
-#define augmented_filename_syscall_enter(syscall) 						\
-struct augmented_enter_##syscall##_args {			 				\
-	struct syscall_enter_##syscall##_args	args;				 		\
-	struct augmented_filename		filename;				 	\
-};												\
-int syscall_enter(syscall)(struct syscall_enter_##syscall##_args *args)				\
-{												\
-	char etc[6] = "/etc/";									\
-	struct augmented_enter_##syscall##_args augmented_args = { .filename.reserved = 0, }; 	\
-	probe_read(&augmented_args.args, sizeof(augmented_args.args), args);			\
-	augmented_args.filename.size = probe_read_str(&augmented_args.filename.value, 		\
-						      sizeof(augmented_args.filename.value), 	\
-						      args->filename_ptr); 			\
-	if (__builtin_memcmp(augmented_args.filename.value, etc, 4) != 0)			\
-		return 0;									\
-	/* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */	\
-	return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, 		\
-				 &augmented_args,						\
-				 (sizeof(augmented_args) - sizeof(augmented_args.filename.value) + \
-				 augmented_args.filename.size));				\
-}
-
-struct syscall_enter_openat_args {
-	unsigned long long common_tp_fields;
-	long		   syscall_nr;
-	long		   dfd;
-	char		   *filename_ptr;
-	long		   flags;
-	long		   mode;
-};
-
-augmented_filename_syscall_enter(openat);
-
-struct syscall_enter_open_args {
-	unsigned long long common_tp_fields;
-	long		   syscall_nr;
-	char		   *filename_ptr;
-	long		   flags;
-	long		   mode;
-};
-
-augmented_filename_syscall_enter(open);
-
-license(GPL);
diff --git a/tools/perf/examples/bpf/hello.c b/tools/perf/examples/bpf/hello.c
index cf3c2fdc7f79..e9080b0df158 100644
--- a/tools/perf/examples/bpf/hello.c
+++ b/tools/perf/examples/bpf/hello.c
@@ -1,9 +1,27 @@
-#include <stdio.h>
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
 
-int syscall_enter(openat)(void *args)
+struct __bpf_stdout__ {
+	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+	__type(key, int);
+	__type(value, __u32);
+	__uint(max_entries, __NR_CPUS__);
+} __bpf_stdout__ SEC(".maps");
+
+#define puts(from) \
+	({ const int __len = sizeof(from); \
+	   char __from[sizeof(from)] = from;			\
+	   bpf_perf_event_output(args, &__bpf_stdout__, BPF_F_CURRENT_CPU, \
+			  &__from, __len & (sizeof(from) - 1)); })
+
+struct syscall_enter_args;
+
+SEC("raw_syscalls:sys_enter")
+int sys_enter(struct syscall_enter_args *args)
 {
 	puts("Hello, world\n");
 	return 0;
 }
 
-license(GPL);
+char _license[] SEC("license") = "GPL";
diff --git a/tools/perf/include/bpf/bpf.h b/tools/perf/include/bpf/bpf.h
deleted file mode 100644
index b422aeef5339..000000000000
--- a/tools/perf/include/bpf/bpf.h
+++ /dev/null
@@ -1,70 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef _PERF_BPF_H
-#define _PERF_BPF_H
-
-#include <uapi/linux/bpf.h>
-
-/*
- * A helper structure used by eBPF C program to describe map attributes to
- * elf_bpf loader, taken from tools/testing/selftests/bpf/bpf_helpers.h:
- */
-struct bpf_map {
-        unsigned int type;
-        unsigned int key_size;
-        unsigned int value_size;
-        unsigned int max_entries;
-        unsigned int map_flags;
-        unsigned int inner_map_idx;
-        unsigned int numa_node;
-};
-
-#define bpf_map(name, _type, type_key, type_val, _max_entries)	\
-struct bpf_map SEC("maps") name = {				\
-	.type	     = BPF_MAP_TYPE_##_type,			\
-	.key_size    = sizeof(type_key),			\
-	.value_size  = sizeof(type_val),			\
-	.max_entries = _max_entries,				\
-};								\
-struct ____btf_map_##name {					\
-	type_key key;						\
-	type_val value;                                 	\
-};								\
-struct ____btf_map_##name __attribute__((section(".maps." #name), used)) \
-	____btf_map_##name = { }
-
-/*
- * FIXME: this should receive .max_entries as a parameter, as careful
- *	  tuning of these limits is needed to avoid hitting limits that
- *	  prevents other BPF constructs, such as tracepoint handlers,
- *	  to get installed, with cryptic messages from libbpf, etc.
- *	  For the current need, 'perf trace --filter-pids', 64 should
- *	  be good enough, but this surely needs to be revisited.
- */
-#define pid_map(name, value_type) bpf_map(name, HASH, pid_t, value_type, 64)
-
-static int (*bpf_map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags) = (void *)BPF_FUNC_map_update_elem;
-static void *(*bpf_map_lookup_elem)(struct bpf_map *map, void *key) = (void *)BPF_FUNC_map_lookup_elem;
-
-static void (*bpf_tail_call)(void *ctx, void *map, int index) = (void *)BPF_FUNC_tail_call;
-
-#define SEC(NAME) __attribute__((section(NAME),  used))
-
-#define probe(function, vars) \
-	SEC(#function "=" #function " " #vars) function
-
-#define syscall_enter(name) \
-	SEC("syscalls:sys_enter_" #name) syscall_enter_ ## name
-
-#define syscall_exit(name) \
-	SEC("syscalls:sys_exit_" #name) syscall_exit_ ## name
-
-#define license(name) \
-char _license[] SEC("license") = #name; \
-int _version SEC("version") = LINUX_VERSION_CODE;
-
-static int (*probe_read)(void *dst, int size, const void *unsafe_addr) = (void *)BPF_FUNC_probe_read;
-static int (*probe_read_str)(void *dst, int size, const void *unsafe_addr) = (void *)BPF_FUNC_probe_read_str;
-
-static int (*perf_event_output)(void *, struct bpf_map *, int, void *, unsigned long) = (void *)BPF_FUNC_perf_event_output;
-
-#endif /* _PERF_BPF_H */
diff --git a/tools/perf/include/bpf/linux/socket.h b/tools/perf/include/bpf/linux/socket.h
deleted file mode 100644
index 7f844568dab8..000000000000
--- a/tools/perf/include/bpf/linux/socket.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _UAPI_LINUX_SOCKET_H
-#define _UAPI_LINUX_SOCKET_H
-
-/*
- * Desired design of maximum size and alignment (see RFC2553)
- */
-#define _K_SS_MAXSIZE	128	/* Implementation specific max size */
-#define _K_SS_ALIGNSIZE	(__alignof__ (struct sockaddr *))
-				/* Implementation specific desired alignment */
-
-typedef unsigned short __kernel_sa_family_t;
-
-struct __kernel_sockaddr_storage {
-	__kernel_sa_family_t	ss_family;		/* address family */
-	/* Following field(s) are implementation specific */
-	char		__data[_K_SS_MAXSIZE - sizeof(unsigned short)];
-				/* space to achieve desired size, */
-				/* _SS_MAXSIZE value minus size of ss_family */
-} __attribute__ ((aligned(_K_SS_ALIGNSIZE)));	/* force desired alignment */
-
-#define sockaddr_storage __kernel_sockaddr_storage
-
-#endif /* _UAPI_LINUX_SOCKET_H */
diff --git a/tools/perf/include/bpf/pid_filter.h b/tools/perf/include/bpf/pid_filter.h
deleted file mode 100644
index 6e61c4bdf548..000000000000
--- a/tools/perf/include/bpf/pid_filter.h
+++ /dev/null
@@ -1,21 +0,0 @@
-// SPDX-License-Identifier: LGPL-2.1
-
-#ifndef _PERF_BPF_PID_FILTER_
-#define _PERF_BPF_PID_FILTER_
-
-#include <bpf.h>
-
-#define pid_filter(name) pid_map(name, bool)
-
-static int pid_filter__add(struct bpf_map *pids, pid_t pid)
-{
-	bool value = true;
-	return bpf_map_update_elem(pids, &pid, &value, BPF_NOEXIST);
-}
-
-static bool pid_filter__has(struct bpf_map *pids, pid_t pid)
-{
-	return bpf_map_lookup_elem(pids, &pid) != NULL;
-}
-
-#endif // _PERF_BPF_PID_FILTER_
diff --git a/tools/perf/include/bpf/stdio.h b/tools/perf/include/bpf/stdio.h
deleted file mode 100644
index 316af5b2ff35..000000000000
--- a/tools/perf/include/bpf/stdio.h
+++ /dev/null
@@ -1,16 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <bpf.h>
-
-struct bpf_map SEC("maps") __bpf_stdout__ = {
-       .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
-       .key_size = sizeof(int),
-       .value_size = sizeof(u32),
-       .max_entries = __NR_CPUS__,
-};
-
-#define puts(from) \
-	({ const int __len = sizeof(from); \
-	   char __from[__len] = from; \
-	   perf_event_output(args, &__bpf_stdout__, BPF_F_CURRENT_CPU, \
-			  &__from, __len & (sizeof(from) - 1)); })
diff --git a/tools/perf/include/bpf/unistd.h b/tools/perf/include/bpf/unistd.h
deleted file mode 100644
index ca7877f9a976..000000000000
--- a/tools/perf/include/bpf/unistd.h
+++ /dev/null
@@ -1,10 +0,0 @@
-// SPDX-License-Identifier: LGPL-2.1
-
-#include <bpf.h>
-
-static int (*bpf_get_current_pid_tgid)(void) = (void *)BPF_FUNC_get_current_pid_tgid;
-
-static pid_t getpid(void)
-{
-	return bpf_get_current_pid_tgid();
-}
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 7af135dea1cd..82bbe0ca858b 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -70,20 +70,26 @@ static struct cmd_struct commands[] = {
 	{ "report",	cmd_report,	0 },
 	{ "bench",	cmd_bench,	0 },
 	{ "stat",	cmd_stat,	0 },
+#ifdef HAVE_LIBTRACEEVENT
 	{ "timechart",	cmd_timechart,	0 },
+#endif
 	{ "top",	cmd_top,	0 },
 	{ "annotate",	cmd_annotate,	0 },
 	{ "version",	cmd_version,	0 },
 	{ "script",	cmd_script,	0 },
+#ifdef HAVE_LIBTRACEEVENT
 	{ "sched",	cmd_sched,	0 },
+#endif
 #ifdef HAVE_LIBELF_SUPPORT
 	{ "probe",	cmd_probe,	0 },
 #endif
+#ifdef HAVE_LIBTRACEEVENT
 	{ "kmem",	cmd_kmem,	0 },
 	{ "lock",	cmd_lock,	0 },
+#endif
 	{ "kvm",	cmd_kvm,	0 },
 	{ "test",	cmd_test,	0 },
-#if defined(HAVE_LIBAUDIT_SUPPORT) || defined(HAVE_SYSCALL_TABLE_SUPPORT)
+#if defined(HAVE_LIBTRACEEVENT) && (defined(HAVE_LIBAUDIT_SUPPORT) || defined(HAVE_SYSCALL_TABLE_SUPPORT))
 	{ "trace",	cmd_trace,	0 },
 #endif
 	{ "inject",	cmd_inject,	0 },
@@ -91,7 +97,9 @@ static struct cmd_struct commands[] = {
 	{ "data",	cmd_data,	0 },
 	{ "ftrace",	cmd_ftrace,	0 },
 	{ "daemon",	cmd_daemon,	0 },
+#ifdef HAVE_LIBTRACEEVENT
 	{ "kwork",	cmd_kwork,	0 },
+#endif
 };
 
 struct pager_config {
@@ -500,14 +508,18 @@ int main(int argc, const char **argv)
 		argv[0] = cmd;
 	}
 	if (strstarts(cmd, "trace")) {
-#if defined(HAVE_LIBAUDIT_SUPPORT) || defined(HAVE_SYSCALL_TABLE_SUPPORT)
-		setup_path();
-		argv[0] = "trace";
-		return cmd_trace(argc, argv);
-#else
+#ifndef HAVE_LIBTRACEEVENT
+		fprintf(stderr,
+			"trace command not available: missing libtraceevent devel package at build time.\n");
+		goto out;
+#elif !defined(HAVE_LIBAUDIT_SUPPORT) && !defined(HAVE_SYSCALL_TABLE_SUPPORT)
 		fprintf(stderr,
 			"trace command not available: missing audit-libs devel package at build time.\n");
 		goto out;
+#else
+		setup_path();
+		argv[0] = "trace";
+		return cmd_trace(argc, argv);
 #endif
 	}
 	/* Look for flags.. */
diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build
index 04ef95174660..15b9e8fdbffa 100644
--- a/tools/perf/pmu-events/Build
+++ b/tools/perf/pmu-events/Build
@@ -21,7 +21,7 @@ $(OUTPUT)pmu-events/pmu-events.c: pmu-events/empty-pmu-events.c
 	$(call rule_mkdir)
 	$(Q)$(call echo-cmd,gen)cp $< $@
 else
-$(OUTPUT)pmu-events/pmu-events.c: $(JSON) $(JSON_TEST) $(JEVENTS_PY)
+$(OUTPUT)pmu-events/pmu-events.c: $(JSON) $(JSON_TEST) $(JEVENTS_PY) pmu-events/metric.py
 	$(call rule_mkdir)
 	$(Q)$(call echo-cmd,gen)$(PYTHON) $(JEVENTS_PY) $(JEVENTS_ARCH) pmu-events/arch $@
 endif
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/branch.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/branch.json
index 79f2016c53b0..79f2016c53b0 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/branch.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/branch.json
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/bus.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/bus.json
index 579c1c993d17..579c1c993d17 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/bus.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/bus.json
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/cache.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/cache.json
index 0141f749bff3..0141f749bff3 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/cache.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/cache.json
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/exception.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/exception.json
index 344a2d552ad5..344a2d552ad5 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/exception.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/exception.json
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/instruction.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/instruction.json
index e57cd55937c6..e57cd55937c6 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/instruction.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/instruction.json
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/memory.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/memory.json
index 7b2b21ac150f..7b2b21ac150f 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/memory.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/memory.json
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/pipeline.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/pipeline.json
index f9fae15f7555..f9fae15f7555 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/pipeline.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/pipeline.json
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/spe.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/spe.json
index 20f2165c85fe..20f2165c85fe 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/spe.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/spe.json
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/trace.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/trace.json
index 3116135c59e2..3116135c59e2 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/trace.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2-v2/trace.json
diff --git a/tools/perf/pmu-events/arch/arm64/mapfile.csv b/tools/perf/pmu-events/arch/arm64/mapfile.csv
index ad502d00f460..f134e833c069 100644
--- a/tools/perf/pmu-events/arch/arm64/mapfile.csv
+++ b/tools/perf/pmu-events/arch/arm64/mapfile.csv
@@ -34,7 +34,8 @@
 0x00000000410fd460,v1,arm/cortex-a510,core
 0x00000000410fd470,v1,arm/cortex-a710,core
 0x00000000410fd480,v1,arm/cortex-x2,core
-0x00000000410fd490,v1,arm/neoverse-n2,core
+0x00000000410fd490,v1,arm/neoverse-n2-v2,core
+0x00000000410fd4f0,v1,arm/neoverse-n2-v2,core
 0x00000000420f5160,v1,cavium/thunderx2,core
 0x00000000430f0af0,v1,cavium/thunderx2,core
 0x00000000460f0010,v1,fujitsu/a64fx,core
diff --git a/tools/perf/pmu-events/arch/riscv/mapfile.csv b/tools/perf/pmu-events/arch/riscv/mapfile.csv
new file mode 100644
index 000000000000..c61b3d6ef616
--- /dev/null
+++ b/tools/perf/pmu-events/arch/riscv/mapfile.csv
@@ -0,0 +1,17 @@
+# Format:
+#	MVENDORID-MARCHID-MIMPID,Version,JSON/file/pathname,Type
+#
+# where
+#	MVENDORID	JEDEC code of the core provider
+#	MARCHID		base microarchitecture of the hart
+#	MIMPID		unique encoding of the version
+#			of the processor implementation
+#	Version could be used to track version of JSON file
+#		but currently unused.
+#	JSON/file/pathname is the path to JSON file, relative
+#		to tools/perf/pmu-events/arch/riscv/.
+#	Type is core, uncore etc
+#
+#
+#MVENDORID-MARCHID-MIMPID,Version,Filename,EventType
+0x489-0x8000000000000007-0x[[:xdigit:]]+,v1,sifive/u74,core
diff --git a/tools/perf/pmu-events/arch/riscv/riscv-sbi-firmware.json b/tools/perf/pmu-events/arch/riscv/riscv-sbi-firmware.json
new file mode 100644
index 000000000000..a9939823b14b
--- /dev/null
+++ b/tools/perf/pmu-events/arch/riscv/riscv-sbi-firmware.json
@@ -0,0 +1,134 @@
+[
+  {
+    "PublicDescription": "Misaligned load trap",
+    "ConfigCode": "0x8000000000000000",
+    "EventName": "FW_MISALIGNED_LOAD",
+    "BriefDescription": "Misaligned load trap event"
+  },
+  {
+    "PublicDescription": "Misaligned store trap",
+    "ConfigCode": "0x8000000000000001",
+    "EventName": "FW_MISALIGNED_STORE",
+    "BriefDescription": "Misaligned store trap event"
+  },
+  {
+    "PublicDescription": "Load access trap",
+    "ConfigCode": "0x8000000000000002",
+    "EventName": "FW_ACCESS_LOAD",
+    "BriefDescription": "Load access trap event"
+  },
+  {
+    "PublicDescription": "Store access trap",
+    "ConfigCode": "0x8000000000000003",
+    "EventName": "FW_ACCESS_STORE",
+    "BriefDescription": "Store access trap event"
+  },
+  {
+    "PublicDescription": "Illegal instruction trap",
+    "ConfigCode": "0x8000000000000004",
+    "EventName": "FW_ILLEGAL_INSN",
+    "BriefDescription": "Illegal instruction trap event"
+  },
+  {
+    "PublicDescription": "Set timer event",
+    "ConfigCode": "0x8000000000000005",
+    "EventName": "FW_SET_TIMER",
+    "BriefDescription": "Set timer event"
+  },
+  {
+    "PublicDescription": "Sent IPI to other HART event",
+    "ConfigCode": "0x8000000000000006",
+    "EventName": "FW_IPI_SENT",
+    "BriefDescription": "Sent IPI to other HART event"
+  },
+  {
+    "PublicDescription": "Received IPI from other HART event",
+    "ConfigCode": "0x8000000000000007",
+    "EventName": "FW_IPI_RECEIVED",
+    "BriefDescription": "Received IPI from other HART event"
+  },
+  {
+    "PublicDescription": "Sent FENCE.I request to other HART event",
+    "ConfigCode": "0x8000000000000008",
+    "EventName": "FW_FENCE_I_SENT",
+    "BriefDescription": "Sent FENCE.I request to other HART event"
+  },
+  {
+    "PublicDescription": "Received FENCE.I request from other HART event",
+    "ConfigCode": "0x8000000000000009",
+    "EventName": "FW_FENCE_I_RECEIVED",
+    "BriefDescription": "Received FENCE.I request from other HART event"
+  },
+  {
+    "PublicDescription": "Sent SFENCE.VMA request to other HART event",
+    "ConfigCode": "0x800000000000000a",
+    "EventName": "FW_SFENCE_VMA_SENT",
+    "BriefDescription": "Sent SFENCE.VMA request to other HART event"
+  },
+  {
+    "PublicDescription": "Received SFENCE.VMA request from other HART event",
+    "ConfigCode": "0x800000000000000b",
+    "EventName": "FW_SFENCE_VMA_RECEIVED",
+    "BriefDescription": "Received SFENCE.VMA request from other HART event"
+  },
+  {
+    "PublicDescription": "Sent SFENCE.VMA with ASID request to other HART event",
+    "ConfigCode": "0x800000000000000c",
+    "EventName": "FW_SFENCE_VMA_RECEIVED",
+    "BriefDescription": "Sent SFENCE.VMA with ASID request to other HART event"
+  },
+  {
+    "PublicDescription": "Received SFENCE.VMA with ASID request from other HART event",
+    "ConfigCode": "0x800000000000000d",
+    "EventName": "FW_SFENCE_VMA_ASID_RECEIVED",
+    "BriefDescription": "Received SFENCE.VMA with ASID request from other HART event"
+  },
+  {
+    "PublicDescription": "Sent HFENCE.GVMA request to other HART event",
+    "ConfigCode": "0x800000000000000e",
+    "EventName": "FW_HFENCE_GVMA_SENT",
+    "BriefDescription": "Sent HFENCE.GVMA request to other HART event"
+  },
+  {
+    "PublicDescription": "Received HFENCE.GVMA request from other HART event",
+    "ConfigCode": "0x800000000000000f",
+    "EventName": "FW_HFENCE_GVMA_RECEIVED",
+    "BriefDescription": "Received HFENCE.GVMA request from other HART event"
+  },
+  {
+    "PublicDescription": "Sent HFENCE.GVMA with VMID request to other HART event",
+    "ConfigCode": "0x8000000000000010",
+    "EventName": "FW_HFENCE_GVMA_VMID_SENT",
+    "BriefDescription": "Sent HFENCE.GVMA with VMID request to other HART event"
+  },
+  {
+    "PublicDescription": "Received HFENCE.GVMA with VMID request from other HART event",
+    "ConfigCode": "0x8000000000000011",
+    "EventName": "FW_HFENCE_GVMA_VMID_RECEIVED",
+    "BriefDescription": "Received HFENCE.GVMA with VMID request from other HART event"
+  },
+  {
+    "PublicDescription": "Sent HFENCE.VVMA request to other HART event",
+    "ConfigCode": "0x8000000000000012",
+    "EventName": "FW_HFENCE_VVMA_SENT",
+    "BriefDescription": "Sent HFENCE.VVMA request to other HART event"
+  },
+  {
+    "PublicDescription": "Received HFENCE.VVMA request from other HART event",
+    "ConfigCode": "0x8000000000000013",
+    "EventName": "FW_HFENCE_VVMA_RECEIVED",
+    "BriefDescription": "Received HFENCE.VVMA request from other HART event"
+  },
+  {
+    "PublicDescription": "Sent HFENCE.VVMA with ASID request to other HART event",
+    "ConfigCode": "0x8000000000000014",
+    "EventName": "FW_HFENCE_VVMA_ASID_SENT",
+    "BriefDescription": "Sent HFENCE.VVMA with ASID request to other HART event"
+  },
+  {
+    "PublicDescription": "Received HFENCE.VVMA with ASID request from other HART event",
+    "ConfigCode": "0x8000000000000015",
+    "EventName": "FW_HFENCE_VVMA_ASID_RECEIVED",
+    "BriefDescription": "Received HFENCE.VVMA with ASID request from other HART event"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/riscv/sifive/u74/firmware.json b/tools/perf/pmu-events/arch/riscv/sifive/u74/firmware.json
new file mode 100644
index 000000000000..9b4a032186a7
--- /dev/null
+++ b/tools/perf/pmu-events/arch/riscv/sifive/u74/firmware.json
@@ -0,0 +1,68 @@
+[
+  {
+    "ArchStdEvent": "FW_MISALIGNED_LOAD"
+  },
+  {
+    "ArchStdEvent": "FW_MISALIGNED_STORE"
+  },
+  {
+    "ArchStdEvent": "FW_ACCESS_LOAD"
+  },
+  {
+    "ArchStdEvent": "FW_ACCESS_STORE"
+  },
+  {
+    "ArchStdEvent": "FW_ILLEGAL_INSN"
+  },
+  {
+    "ArchStdEvent": "FW_SET_TIMER"
+  },
+  {
+    "ArchStdEvent": "FW_IPI_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_IPI_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_FENCE_I_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_FENCE_I_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_VMID_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_VMID_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_ASID_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_ASID_RECEIVED"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/riscv/sifive/u74/instructions.json b/tools/perf/pmu-events/arch/riscv/sifive/u74/instructions.json
new file mode 100644
index 000000000000..5eab718c9256
--- /dev/null
+++ b/tools/perf/pmu-events/arch/riscv/sifive/u74/instructions.json
@@ -0,0 +1,92 @@
+[
+  {
+    "EventName": "EXCEPTION_TAKEN",
+    "EventCode": "0x0000100",
+    "BriefDescription": "Exception taken"
+  },
+  {
+    "EventName": "INTEGER_LOAD_RETIRED",
+    "EventCode": "0x0000200",
+    "BriefDescription": "Integer load instruction retired"
+  },
+  {
+    "EventName": "INTEGER_STORE_RETIRED",
+    "EventCode": "0x0000400",
+    "BriefDescription": "Integer store instruction retired"
+  },
+  {
+    "EventName": "ATOMIC_MEMORY_RETIRED",
+    "EventCode": "0x0000800",
+    "BriefDescription": "Atomic memory operation retired"
+  },
+  {
+    "EventName": "SYSTEM_INSTRUCTION_RETIRED",
+    "EventCode": "0x0001000",
+    "BriefDescription": "System instruction retired"
+  },
+  {
+    "EventName": "INTEGER_ARITHMETIC_RETIRED",
+    "EventCode": "0x0002000",
+    "BriefDescription": "Integer arithmetic instruction retired"
+  },
+  {
+    "EventName": "CONDITIONAL_BRANCH_RETIRED",
+    "EventCode": "0x0004000",
+    "BriefDescription": "Conditional branch retired"
+  },
+  {
+    "EventName": "JAL_INSTRUCTION_RETIRED",
+    "EventCode": "0x0008000",
+    "BriefDescription": "JAL instruction retired"
+  },
+  {
+    "EventName": "JALR_INSTRUCTION_RETIRED",
+    "EventCode": "0x0010000",
+    "BriefDescription": "JALR instruction retired"
+  },
+  {
+    "EventName": "INTEGER_MULTIPLICATION_RETIRED",
+    "EventCode": "0x0020000",
+    "BriefDescription": "Integer multiplication instruction retired"
+  },
+  {
+    "EventName": "INTEGER_DIVISION_RETIRED",
+    "EventCode": "0x0040000",
+    "BriefDescription": "Integer division instruction retired"
+  },
+  {
+    "EventName": "FP_LOAD_RETIRED",
+    "EventCode": "0x0080000",
+    "BriefDescription": "Floating-point load instruction retired"
+  },
+  {
+    "EventName": "FP_STORE_RETIRED",
+    "EventCode": "0x0100000",
+    "BriefDescription": "Floating-point store instruction retired"
+  },
+  {
+    "EventName": "FP_ADDITION_RETIRED",
+    "EventCode": "0x0200000",
+    "BriefDescription": "Floating-point addition retired"
+  },
+  {
+    "EventName": "FP_MULTIPLICATION_RETIRED",
+    "EventCode": "0x0400000",
+    "BriefDescription": "Floating-point multiplication retired"
+  },
+  {
+    "EventName": "FP_FUSEDMADD_RETIRED",
+    "EventCode": "0x0800000",
+    "BriefDescription": "Floating-point fused multiply-add retired"
+  },
+  {
+    "EventName": "FP_DIV_SQRT_RETIRED",
+    "EventCode": "0x1000000",
+    "BriefDescription": "Floating-point division or square-root retired"
+  },
+  {
+    "EventName": "OTHER_FP_RETIRED",
+    "EventCode": "0x2000000",
+    "BriefDescription": "Other floating-point instruction retired"
+  }
+]
+\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/riscv/sifive/u74/memory.json b/tools/perf/pmu-events/arch/riscv/sifive/u74/memory.json
new file mode 100644
index 000000000000..be1a46312ac3
--- /dev/null
+++ b/tools/perf/pmu-events/arch/riscv/sifive/u74/memory.json
@@ -0,0 +1,32 @@
+[
+  {
+    "EventName": "ICACHE_RETIRED",
+    "EventCode": "0x0000102",
+    "BriefDescription": "Instruction cache miss"
+  },
+  {
+    "EventName": "DCACHE_MISS_MMIO_ACCESSES",
+    "EventCode": "0x0000202",
+    "BriefDescription": "Data cache miss or memory-mapped I/O access"
+  },
+  {
+    "EventName": "DCACHE_WRITEBACK",
+    "EventCode": "0x0000402",
+    "BriefDescription": "Data cache write-back"
+  },
+  {
+    "EventName": "INST_TLB_MISS",
+    "EventCode": "0x0000802",
+    "BriefDescription": "Instruction TLB miss"
+  },
+  {
+    "EventName": "DATA_TLB_MISS",
+    "EventCode": "0x0001002",
+    "BriefDescription": "Data TLB miss"
+  },
+  {
+    "EventName": "UTLB_MISS",
+    "EventCode": "0x0002002",
+    "BriefDescription": "UTLB miss"
+  }
+]
+\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/riscv/sifive/u74/microarch.json b/tools/perf/pmu-events/arch/riscv/sifive/u74/microarch.json
new file mode 100644
index 000000000000..50ffa55418cb
--- /dev/null
+++ b/tools/perf/pmu-events/arch/riscv/sifive/u74/microarch.json
@@ -0,0 +1,57 @@
+[
+  {
+    "EventName": "ADDRESSGEN_INTERLOCK",
+    "EventCode": "0x0000101",
+    "BriefDescription": "Address-generation interlock"
+  },
+  {
+    "EventName": "LONGLAT_INTERLOCK",
+    "EventCode": "0x0000201",
+    "BriefDescription": "Long-latency interlock"
+  },
+  {
+    "EventName": "CSR_READ_INTERLOCK",
+    "EventCode": "0x0000401",
+    "BriefDescription": "CSR read interlock"
+  },
+  {
+    "EventName": "ICACHE_ITIM_BUSY",
+    "EventCode": "0x0000801",
+    "BriefDescription": "Instruction cache/ITIM busy"
+  },
+  {
+    "EventName": "DCACHE_DTIM_BUSY",
+    "EventCode": "0x0001001",
+    "BriefDescription": "Data cache/DTIM busy"
+  },
+  {
+    "EventName": "BRANCH_DIRECTION_MISPREDICTION",
+    "EventCode": "0x0002001",
+    "BriefDescription": "Branch direction misprediction"
+  },
+  {
+    "EventName": "BRANCH_TARGET_MISPREDICTION",
+    "EventCode": "0x0004001",
+    "BriefDescription": "Branch/jump target misprediction"
+  },
+  {
+    "EventName": "PIPE_FLUSH_CSR_WRITE",
+    "EventCode": "0x0008001",
+    "BriefDescription": "Pipeline flush from CSR write"
+  },
+  {
+    "EventName": "PIPE_FLUSH_OTHER_EVENT",
+    "EventCode": "0x0010001",
+    "BriefDescription": "Pipeline flush from other event"
+  },
+  {
+    "EventName": "INTEGER_MULTIPLICATION_INTERLOCK",
+    "EventCode": "0x0020001",
+    "BriefDescription": "Integer multiplication interlock"
+  },
+  {
+    "EventName": "FP_INTERLOCK",
+    "EventCode": "0x0040001",
+    "BriefDescription": "Floating-point interlock"
+  }
+]
+\ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json
index e06d26ad5138..edf440e9359a 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json
@@ -1287,14 +1287,14 @@
     },
     {
         "BriefDescription": "Average CPU Utilization",
-        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@",
+        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
         "MetricGroup": "HPC;Summary",
         "MetricName": "CPU_Utilization",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
-        "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time",
+        "MetricExpr": "Turbo_Utilization * TSC / 1000000000 / duration_time",
         "MetricGroup": "Power;Summary",
         "MetricName": "Average_Frequency",
         "Unit": "cpu_core"
@@ -1337,19 +1337,26 @@
     },
     {
         "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
-        "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1000000 / duration_time / 1000",
+        "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1000000 / duration_time / 1000",
         "MetricGroup": "HPC;Mem;MemoryBW;SoC",
         "MetricName": "DRAM_BW_Use",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Average number of parallel requests to external memory. Accounts for all requests",
-        "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / arb@event\\=0x81\\,umask\\=0x1@",
+        "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL",
         "MetricGroup": "Mem;SoC",
         "MetricName": "MEM_Parallel_Requests",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Socket actual clocks when any core is active on that socket",
+        "MetricExpr": "UNC_CLOCK.SOCKET",
+        "MetricGroup": "SoC",
+        "MetricName": "Socket_CLKS",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
         "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u",
         "MetricGroup": "Branches;OS",
@@ -1357,6 +1364,12 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Uncore frequency per die [GHZ]",
+        "MetricExpr": "Socket_CLKS / #num_dies / duration_time / 1000000000",
+        "MetricGroup": "SoC",
+        "MetricName": "UNCORE_FREQ"
+    },
+    {
         "BriefDescription": "Counts the number of issue slots  that were not consumed by the backend due to frontend stalls.",
         "MetricExpr": "TOPDOWN_FE_BOUND.ALL / SLOTS",
         "MetricGroup": "TopdownL1",
@@ -1902,7 +1915,7 @@
     },
     {
         "BriefDescription": "Average CPU Utilization",
-        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@",
+        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
         "MetricName": "CPU_Utilization",
         "Unit": "cpu_atom"
     },
@@ -1950,62 +1963,72 @@
     },
     {
         "BriefDescription": "C1 residency percent per core",
-        "MetricExpr": "(cstate_core@c1\\-residency@ / msr@tsc@) * 100",
+        "MetricExpr": "cstate_core@c1\\-residency@ / TSC",
         "MetricGroup": "Power",
-        "MetricName": "C1_Core_Residency"
+        "MetricName": "C1_Core_Residency",
+        "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "C6 residency percent per core",
-        "MetricExpr": "(cstate_core@c6\\-residency@ / msr@tsc@) * 100",
+        "MetricExpr": "cstate_core@c6\\-residency@ / TSC",
         "MetricGroup": "Power",
-        "MetricName": "C6_Core_Residency"
+        "MetricName": "C6_Core_Residency",
+        "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "C7 residency percent per core",
-        "MetricExpr": "(cstate_core@c7\\-residency@ / msr@tsc@) * 100",
+        "MetricExpr": "cstate_core@c7\\-residency@ / TSC",
         "MetricGroup": "Power",
-        "MetricName": "C7_Core_Residency"
+        "MetricName": "C7_Core_Residency",
+        "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "C2 residency percent per package",
-        "MetricExpr": "(cstate_pkg@c2\\-residency@ / msr@tsc@) * 100",
+        "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC",
         "MetricGroup": "Power",
-        "MetricName": "C2_Pkg_Residency"
+        "MetricName": "C2_Pkg_Residency",
+        "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "C3 residency percent per package",
-        "MetricExpr": "(cstate_pkg@c3\\-residency@ / msr@tsc@) * 100",
+        "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC",
         "MetricGroup": "Power",
-        "MetricName": "C3_Pkg_Residency"
+        "MetricName": "C3_Pkg_Residency",
+        "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "C6 residency percent per package",
-        "MetricExpr": "(cstate_pkg@c6\\-residency@ / msr@tsc@) * 100",
+        "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC",
         "MetricGroup": "Power",
-        "MetricName": "C6_Pkg_Residency"
+        "MetricName": "C6_Pkg_Residency",
+        "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "C7 residency percent per package",
-        "MetricExpr": "(cstate_pkg@c7\\-residency@ / msr@tsc@) * 100",
+        "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC",
         "MetricGroup": "Power",
-        "MetricName": "C7_Pkg_Residency"
+        "MetricName": "C7_Pkg_Residency",
+        "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "C8 residency percent per package",
-        "MetricExpr": "(cstate_pkg@c8\\-residency@ / msr@tsc@) * 100",
+        "MetricExpr": "cstate_pkg@c8\\-residency@ / TSC",
         "MetricGroup": "Power",
-        "MetricName": "C8_Pkg_Residency"
+        "MetricName": "C8_Pkg_Residency",
+        "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "C9 residency percent per package",
-        "MetricExpr": "(cstate_pkg@c9\\-residency@ / msr@tsc@) * 100",
+        "MetricExpr": "cstate_pkg@c9\\-residency@ / TSC",
         "MetricGroup": "Power",
-        "MetricName": "C9_Pkg_Residency"
+        "MetricName": "C9_Pkg_Residency",
+        "ScaleUnit": "100%"
     },
     {
         "BriefDescription": "C10 residency percent per package",
-        "MetricExpr": "(cstate_pkg@c10\\-residency@ / msr@tsc@) * 100",
+        "MetricExpr": "cstate_pkg@c10\\-residency@ / TSC",
         "MetricGroup": "Power",
-        "MetricName": "C10_Pkg_Residency"
+        "MetricName": "C10_Pkg_Residency",
+        "ScaleUnit": "100%"
     }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/cache.json b/tools/perf/pmu-events/arch/x86/alderlake/cache.json
index 2cc62d2779d2..adc9887b8ae0 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/cache.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/cache.json
@@ -1,1178 +1,871 @@
 [
     {
-        "BriefDescription": "Counts the number of cacheable memory requests that miss in the LLC. Counts on a per core basis.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x2e",
-        "EventName": "LONGEST_LAT_CACHE.MISS",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "Speculative": "1",
-        "UMask": "0x41",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of cacheable memory requests that access the LLC. Counts on a per core basis.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x2e",
-        "EventName": "LONGEST_LAT_CACHE.REFERENCE",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "Speculative": "1",
-        "UMask": "0x4f",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the L2, LLC, DRAM or MMIO (Non-DRAM).",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x34",
-        "EventName": "MEM_BOUND_STALLS.IFETCH",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "Speculative": "1",
-        "UMask": "0x38",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in DRAM or MMIO (Non-DRAM).",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x34",
-        "EventName": "MEM_BOUND_STALLS.IFETCH_DRAM_HIT",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "Speculative": "1",
-        "UMask": "0x20",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the L2 cache.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x34",
-        "EventName": "MEM_BOUND_STALLS.IFETCH_L2_HIT",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "Speculative": "1",
-        "UMask": "0x8",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the LLC or other core with HITE/F/M.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x34",
-        "EventName": "MEM_BOUND_STALLS.IFETCH_LLC_HIT",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "Speculative": "1",
-        "UMask": "0x10",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load miss which hit in the L2, LLC, DRAM or MMIO (Non-DRAM).",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x34",
-        "EventName": "MEM_BOUND_STALLS.LOAD",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "Speculative": "1",
-        "UMask": "0x7",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load miss which hit in DRAM or MMIO (Non-DRAM).",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x34",
-        "EventName": "MEM_BOUND_STALLS.LOAD_DRAM_HIT",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "Speculative": "1",
-        "UMask": "0x4",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load which hit in the L2 cache.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x34",
-        "EventName": "MEM_BOUND_STALLS.LOAD_L2_HIT",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "Speculative": "1",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load which hit in the LLC or other core with HITE/F/M.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x34",
-        "EventName": "MEM_BOUND_STALLS.LOAD_LLC_HIT",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "Speculative": "1",
-        "UMask": "0x2",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of load uops retired that hit in DRAM.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "Data_LA": "1",
-        "EventCode": "0xd1",
-        "EventName": "MEM_LOAD_UOPS_RETIRED.DRAM_HIT",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "UMask": "0x80",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of load uops retired that hit in the L2 cache.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "Data_LA": "1",
-        "EventCode": "0xd1",
-        "EventName": "MEM_LOAD_UOPS_RETIRED.L2_HIT",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "UMask": "0x2",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of load uops retired that hit in the L3 cache.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "Data_LA": "1",
-        "EventCode": "0xd1",
-        "EventName": "MEM_LOAD_UOPS_RETIRED.L3_HIT",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "UMask": "0x4",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of cycles that uops are blocked for any of the following reasons:  load buffer, store buffer or RSV full.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x04",
-        "EventName": "MEM_SCHEDULER_BLOCK.ALL",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "20003",
-        "Speculative": "1",
-        "UMask": "0x7",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of cycles that uops are blocked due to a load buffer full condition.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x04",
-        "EventName": "MEM_SCHEDULER_BLOCK.LD_BUF",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "20003",
-        "Speculative": "1",
-        "UMask": "0x2",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of cycles that uops are blocked due to an RSV full condition.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x04",
-        "EventName": "MEM_SCHEDULER_BLOCK.RSV",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "20003",
-        "Speculative": "1",
-        "UMask": "0x4",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of cycles that uops are blocked due to a store buffer full condition.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x04",
-        "EventName": "MEM_SCHEDULER_BLOCK.ST_BUF",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "20003",
-        "Speculative": "1",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of load uops retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "Data_LA": "1",
-        "EventCode": "0xd0",
-        "EventName": "MEM_UOPS_RETIRED.ALL_LOADS",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "UMask": "0x81",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of store uops retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "Data_LA": "1",
-        "EventCode": "0xd0",
-        "EventName": "MEM_UOPS_RETIRED.ALL_STORES",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "UMask": "0x82",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 128 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1",
-        "Data_LA": "1",
-        "EventCode": "0xd0",
-        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_128",
-        "L1_Hit_Indication": "1",
-        "MSRIndex": "0x3F6",
-        "MSRValue": "0x80",
-        "PEBS": "2",
-        "PEBScounters": "0,1",
-        "SampleAfterValue": "1000003",
-        "TakenAlone": "1",
-        "UMask": "0x5",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 16 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1",
-        "Data_LA": "1",
-        "EventCode": "0xd0",
-        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_16",
-        "L1_Hit_Indication": "1",
-        "MSRIndex": "0x3F6",
-        "MSRValue": "0x10",
-        "PEBS": "2",
-        "PEBScounters": "0,1",
-        "SampleAfterValue": "1000003",
-        "TakenAlone": "1",
-        "UMask": "0x5",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 256 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1",
-        "Data_LA": "1",
-        "EventCode": "0xd0",
-        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_256",
-        "L1_Hit_Indication": "1",
-        "MSRIndex": "0x3F6",
-        "MSRValue": "0x100",
-        "PEBS": "2",
-        "PEBScounters": "0,1",
-        "SampleAfterValue": "1000003",
-        "TakenAlone": "1",
-        "UMask": "0x5",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 32 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1",
-        "Data_LA": "1",
-        "EventCode": "0xd0",
-        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_32",
-        "L1_Hit_Indication": "1",
-        "MSRIndex": "0x3F6",
-        "MSRValue": "0x20",
-        "PEBS": "2",
-        "PEBScounters": "0,1",
-        "SampleAfterValue": "1000003",
-        "TakenAlone": "1",
-        "UMask": "0x5",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 4 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1",
-        "Data_LA": "1",
-        "EventCode": "0xd0",
-        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_4",
-        "L1_Hit_Indication": "1",
-        "MSRIndex": "0x3F6",
-        "MSRValue": "0x4",
-        "PEBS": "2",
-        "PEBScounters": "0,1",
-        "SampleAfterValue": "1000003",
-        "TakenAlone": "1",
-        "UMask": "0x5",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 512 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1",
-        "Data_LA": "1",
-        "EventCode": "0xd0",
-        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_512",
-        "L1_Hit_Indication": "1",
-        "MSRIndex": "0x3F6",
-        "MSRValue": "0x200",
-        "PEBS": "2",
-        "PEBScounters": "0,1",
-        "SampleAfterValue": "1000003",
-        "TakenAlone": "1",
-        "UMask": "0x5",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 64 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1",
-        "Data_LA": "1",
-        "EventCode": "0xd0",
-        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_64",
-        "L1_Hit_Indication": "1",
-        "MSRIndex": "0x3F6",
-        "MSRValue": "0x40",
-        "PEBS": "2",
-        "PEBScounters": "0,1",
-        "SampleAfterValue": "1000003",
-        "TakenAlone": "1",
-        "UMask": "0x5",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 8 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1",
-        "Data_LA": "1",
-        "EventCode": "0xd0",
-        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_8",
-        "L1_Hit_Indication": "1",
-        "MSRIndex": "0x3F6",
-        "MSRValue": "0x8",
-        "PEBS": "2",
-        "PEBScounters": "0,1",
-        "SampleAfterValue": "1000003",
-        "TakenAlone": "1",
-        "UMask": "0x5",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of retired split load uops.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "Data_LA": "1",
-        "EventCode": "0xd0",
-        "EventName": "MEM_UOPS_RETIRED.SPLIT_LOADS",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "UMask": "0x41",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of stores uops retired. Counts with or without PEBS enabled.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "Data_LA": "1",
-        "EventCode": "0xd0",
-        "EventName": "MEM_UOPS_RETIRED.STORE_LATENCY",
-        "L1_Hit_Indication": "1",
-        "PEBS": "2",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "UMask": "0x6",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts demand data reads that were supplied by the L3 cache.",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xB7",
-        "EventName": "OCR.DEMAND_DATA_RD.L3_HIT",
-        "MSRIndex": "0x1a6,0x1a7",
-        "MSRValue": "0x3F803C0001",
-        "SampleAfterValue": "100003",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, and modified data was forwarded.",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xB7",
-        "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "MSRValue": "0x10003C0001",
-        "SampleAfterValue": "100003",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, but no data was forwarded.",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xB7",
-        "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_NO_FWD",
-        "MSRIndex": "0x1a6,0x1a7",
-        "MSRValue": "0x4003C0001",
-        "SampleAfterValue": "100003",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, and non-modified data was forwarded.",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xB7",
-        "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD",
-        "MSRIndex": "0x1a6,0x1a7",
-        "MSRValue": "0x8003C0001",
-        "SampleAfterValue": "100003",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache.",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xB7",
-        "EventName": "OCR.DEMAND_RFO.L3_HIT",
-        "MSRIndex": "0x1a6,0x1a7",
-        "MSRValue": "0x3F803C0002",
-        "SampleAfterValue": "100003",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache where a snoop was sent, the snoop hit, and modified data was forwarded.",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xB7",
-        "EventName": "OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM",
-        "MSRIndex": "0x1a6,0x1a7",
-        "MSRValue": "0x10003C0002",
-        "SampleAfterValue": "100003",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to instruction cache misses.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x71",
-        "EventName": "TOPDOWN_FE_BOUND.ICACHE",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x20",
-        "Unit": "cpu_atom"
-    },
-    {
         "BriefDescription": "L1D.HWPF_MISS",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x51",
         "EventName": "L1D.HWPF_MISS",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x20",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts the number of cache lines replaced in L1 data cache.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x51",
         "EventName": "L1D.REPLACEMENT",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts L1D data line replacements including opportunistic replacements, and replacements that require stall-for-replace or block-for-replace.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x48",
         "EventName": "L1D_PEND_MISS.FB_FULL",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailablability.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
+        "BriefDescription": "Number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability.",
         "CounterMask": "1",
         "EdgeDetect": "1",
         "EventCode": "0x48",
         "EventName": "L1D_PEND_MISS.FB_FULL_PERIODS",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "This event is deprecated. Refer to new event L1D_PEND_MISS.L2_STALLS",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
+        "Deprecated": "1",
         "EventCode": "0x48",
         "EventName": "L1D_PEND_MISS.L2_STALL",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Number of cycles a demand request has waited due to L1D due to lack of L2 resources.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x48",
         "EventName": "L1D_PEND_MISS.L2_STALLS",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts number of cycles a demand request has waited due to L1D due to lack of L2 resources. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Number of L1D misses that are outstanding",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x48",
         "EventName": "L1D_PEND_MISS.PENDING",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts number of L1D misses that are outstanding in each cycle, that is each cycle the number of Fill Buffers (FB) outstanding required by Demand Reads. FB either is held by demand loads, or it is held by non-demand loads and gets hit at least once by demand. The valid outstanding interval is defined until the FB deallocation by one of the following ways: from FB allocation, if FB is allocated by demand from the demand Hit FB, if it is allocated by hardware or software prefetch. Note: In the L1D, a Demand Read contains cacheable or noncacheable demand loads, including ones causing cache-line splits and reads due to page walks resulted from any request type.",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles with L1D load Misses outstanding.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "1",
         "EventCode": "0x48",
         "EventName": "L1D_PEND_MISS.PENDING_CYCLES",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts duration of L1D miss outstanding in cycles.",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "L2 cache lines filling L2",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x25",
         "EventName": "L2_LINES_IN.ALL",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the number of L2 cache lines filling the L2. Counting does not cover rejects.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x1f",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cache lines that have been L2 hardware prefetched but not used by demand accesses",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x26",
         "EventName": "L2_LINES_OUT.USELESS_HWPF",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the number of cache lines that have been prefetched by the L2 hardware prefetcher but not used by demand access when evicted from the L2 cache",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "All accesses to L2 cache[This event is alias to L2_RQSTS.REFERENCES]",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x24",
         "EventName": "L2_REQUEST.ALL",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses.[This event is alias to L2_RQSTS.REFERENCES]",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0xff",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Read requests with true-miss in L2 cache.[This event is alias to L2_RQSTS.MISS]",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x24",
         "EventName": "L2_REQUEST.MISS",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses.[This event is alias to L2_RQSTS.MISS]",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0x3f",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "L2 code requests",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x24",
         "EventName": "L2_RQSTS.ALL_CODE_RD",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the total number of L2 code requests.",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0xe4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Demand Data Read access L2 cache",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x24",
         "EventName": "L2_RQSTS.ALL_DEMAND_DATA_RD",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts Demand Data Read requests accessing the L2 cache. These requests may hit or miss L2 cache. True-miss exclude misses that were merged with ongoing L2 misses. An access is counted once.",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0xe1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Demand requests that miss L2 cache",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x24",
         "EventName": "L2_RQSTS.ALL_DEMAND_MISS",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts demand requests that miss L2 cache.",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0x27",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "L2_RQSTS.ALL_HWPF",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x24",
         "EventName": "L2_RQSTS.ALL_HWPF",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0xf0",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "RFO requests to L2 cache.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x24",
         "EventName": "L2_RQSTS.ALL_RFO",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the total number of RFO (read for ownership) requests to L2 cache. L2 RFO requests include both L1D demand RFO misses as well as L1D RFO prefetches.",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0xe2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "L2 cache hits when fetching instructions, code reads.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x24",
         "EventName": "L2_RQSTS.CODE_RD_HIT",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts L2 cache hits when fetching instructions, code reads.",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0xc4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "L2 cache misses when fetching instructions",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x24",
         "EventName": "L2_RQSTS.CODE_RD_MISS",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts L2 cache misses when fetching instructions.",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0x24",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Demand Data Read requests that hit L2 cache",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x24",
         "EventName": "L2_RQSTS.DEMAND_DATA_RD_HIT",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the number of demand Data Read requests initiated by load instructions that hit L2 cache.",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0xc1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Demand Data Read miss L2 cache",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x24",
         "EventName": "L2_RQSTS.DEMAND_DATA_RD_MISS",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts demand Data Read requests with true-miss in the L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. An access is counted once.",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0x21",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "L2_RQSTS.HWPF_MISS",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x24",
         "EventName": "L2_RQSTS.HWPF_MISS",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0x30",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Read requests with true-miss in L2 cache.[This event is alias to L2_REQUEST.MISS]",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x24",
         "EventName": "L2_RQSTS.MISS",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses.[This event is alias to L2_REQUEST.MISS]",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0x3f",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "All accesses to L2 cache[This event is alias to L2_REQUEST.ALL]",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x24",
         "EventName": "L2_RQSTS.REFERENCES",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses.[This event is alias to L2_REQUEST.ALL]",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0xff",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "RFO requests that hit L2 cache.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x24",
         "EventName": "L2_RQSTS.RFO_HIT",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that hit L2 cache.",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0xc2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "RFO requests that miss L2 cache",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x24",
         "EventName": "L2_RQSTS.RFO_MISS",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that miss L2 cache.",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0x22",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "SW prefetch requests that hit L2 cache.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x24",
         "EventName": "L2_RQSTS.SWPF_HIT",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts Software prefetch requests that hit the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full.",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0xc8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "SW prefetch requests that miss L2 cache.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x24",
         "EventName": "L2_RQSTS.SWPF_MISS",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts Software prefetch requests that miss the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full.",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0x28",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of cacheable memory requests that miss in the LLC. Counts on a per core basis.",
+        "EventCode": "0x2e",
+        "EventName": "LONGEST_LAT_CACHE.MISS",
+        "PublicDescription": "Counts the number of cacheable memory requests that miss in the Last Level Cache (LLC). Requests include demand loads, reads for ownership (RFO), instruction fetches and L1 HW prefetches. If the platform has an L3 cache, the LLC is the L3 cache, otherwise it is the L2 cache. Counts on a per core basis.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x41",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Core-originated cacheable requests that missed L3  (Except hardware prefetches to the L3)",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0x2e",
         "EventName": "LONGEST_LAT_CACHE.MISS",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts core-originated cacheable requests that miss the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2.  It does not include hardware prefetches to the L3, and may not count other types of requests to the L3.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x41",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of cacheable memory requests that access the LLC. Counts on a per core basis.",
+        "EventCode": "0x2e",
+        "EventName": "LONGEST_LAT_CACHE.REFERENCE",
+        "PublicDescription": "Counts the number of cacheable memory requests that access the Last Level Cache (LLC). Requests include demand loads, reads for ownership (RFO), instruction fetches and L1 HW prefetches. If the platform has an L3 cache, the LLC is the L3 cache, otherwise it is the L2 cache. Counts on a per core basis.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x4f",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Core-originated cacheable requests that refer to L3 (Except hardware prefetches to the L3)",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0x2e",
         "EventName": "LONGEST_LAT_CACHE.REFERENCE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts core-originated cacheable requests to the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2.  It does not include hardware prefetches to the L3, and may not count other types of requests to the L3.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x4f",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the L2, LLC, DRAM or MMIO (Non-DRAM).",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS.IFETCH",
+        "PublicDescription": "Counts the number of cycles the core is stalled due to an instruction cache or translation lookaside buffer (TLB) miss which hit in the L2, LLC, DRAM or MMIO (Non-DRAM).",
+        "SampleAfterValue": "200003",
+        "UMask": "0x38",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in DRAM or MMIO (Non-DRAM).",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS.IFETCH_DRAM_HIT",
+        "PublicDescription": "Counts the number of cycles the core is stalled due to an instruction cache or translation lookaside buffer (TLB) miss which hit in DRAM or MMIO (non-DRAM).",
+        "SampleAfterValue": "200003",
+        "UMask": "0x20",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the L2 cache.",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS.IFETCH_L2_HIT",
+        "PublicDescription": "Counts the number of cycles the core is stalled due to an instruction cache or Translation Lookaside Buffer (TLB) miss which hit in the L2 cache.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x8",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the LLC or other core with HITE/F/M.",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS.IFETCH_LLC_HIT",
+        "PublicDescription": "Counts the number of cycles the core is stalled due to an instruction cache or Translation Lookaside Buffer (TLB) miss which hit in the Last Level Cache (LLC) or other core with HITE/F/M.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x10",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load miss which hit in the L2, LLC, DRAM or MMIO (Non-DRAM).",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS.LOAD",
+        "SampleAfterValue": "200003",
+        "UMask": "0x7",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load miss which hit in DRAM or MMIO (Non-DRAM).",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS.LOAD_DRAM_HIT",
+        "SampleAfterValue": "200003",
+        "UMask": "0x4",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load which hit in the L2 cache.",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS.LOAD_L2_HIT",
+        "SampleAfterValue": "200003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load which hit in the LLC or other core with HITE/F/M.",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS.LOAD_LLC_HIT",
+        "PublicDescription": "Counts the number of cycles the core is stalled due to a demand load which hit in the Last Level Cache (LLC) or other core with HITE/F/M.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Retired load instructions.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_INST_RETIRED.ALL_LOADS",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts all retired load instructions. This event accounts for SW prefetch instructions of PREFETCHNTA or PREFETCHT0/1/2 or PREFETCHW.",
         "SampleAfterValue": "1000003",
         "UMask": "0x81",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired store instructions.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_INST_RETIRED.ALL_STORES",
-        "L1_Hit_Indication": "1",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts all retired store instructions.",
         "SampleAfterValue": "1000003",
         "UMask": "0x82",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "All retired memory instructions.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_INST_RETIRED.ANY",
-        "L1_Hit_Indication": "1",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts all retired memory instructions - loads and stores.",
         "SampleAfterValue": "1000003",
         "UMask": "0x83",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired load instructions with locked access.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_INST_RETIRED.LOCK_LOADS",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts retired load instructions with locked access.",
         "SampleAfterValue": "100007",
         "UMask": "0x21",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired load instructions that split across a cacheline boundary.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_INST_RETIRED.SPLIT_LOADS",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts retired load instructions that split across a cacheline boundary.",
         "SampleAfterValue": "100003",
         "UMask": "0x41",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired store instructions that split across a cacheline boundary.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_INST_RETIRED.SPLIT_STORES",
-        "L1_Hit_Indication": "1",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts retired store instructions that split across a cacheline boundary.",
         "SampleAfterValue": "100003",
         "UMask": "0x42",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired load instructions that miss the STLB.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_INST_RETIRED.STLB_MISS_LOADS",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Number of retired load instructions that (start a) miss in the 2nd-level TLB (STLB).",
         "SampleAfterValue": "100003",
         "UMask": "0x11",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired store instructions that miss the STLB.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_INST_RETIRED.STLB_MISS_STORES",
-        "L1_Hit_Indication": "1",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Number of retired store instructions that (start a) miss in the 2nd-level TLB (STLB).",
         "SampleAfterValue": "100003",
         "UMask": "0x12",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Completed demand load uops that miss the L1 d-cache.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x43",
         "EventName": "MEM_LOAD_COMPLETED.L1_MISS_ANY",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Number of completed demand load requests that missed the L1 data cache including shadow misses (FB hits, merge to an ongoing L1D miss)",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0xfd",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired load instructions whose data sources were HitM responses from shared L3",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd2",
         "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts retired load instructions whose data sources were HitM responses from shared L3.",
         "SampleAfterValue": "20011",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired load instructions whose data sources were L3 and cross-core snoop hits in on-pkg core cache",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd2",
         "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts retired load instructions whose data sources were L3 and cross-core snoop hits in on-pkg core cache.",
         "SampleAfterValue": "20011",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired load instructions whose data sources were HitM responses from shared L3",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd2",
         "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts retired load instructions whose data sources were HitM responses from shared L3.",
         "SampleAfterValue": "20011",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired load instructions whose data sources were L3 hit and cross-core snoop missed in on-pkg core cache.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd2",
         "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the retired load instructions whose data sources were L3 hit and cross-core snoop missed in on-pkg core cache.",
         "SampleAfterValue": "20011",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired load instructions whose data sources were hits in L3 without snoops required",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd2",
         "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_NONE",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts retired load instructions whose data sources were hits in L3 without snoops required.",
         "SampleAfterValue": "100003",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired load instructions whose data sources were L3 and cross-core snoop hits in on-pkg core cache",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd2",
         "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts retired load instructions whose data sources were L3 and cross-core snoop hits in on-pkg core cache.",
         "SampleAfterValue": "20011",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired load instructions which data sources missed L3 but serviced from local dram",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd3",
         "EventName": "MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM",
-        "PEBScounters": "0,1,2,3",
+        "PEBS": "1",
+        "PublicDescription": "Retired load instructions which data sources missed L3 but serviced from local DRAM.",
         "SampleAfterValue": "100007",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired instructions with at least 1 uncacheable load or lock.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd4",
         "EventName": "MEM_LOAD_MISC_RETIRED.UC",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Retired instructions with at least one load to uncacheable memory-type, or at least one cache-line split locked access (Bus Lock).",
         "SampleAfterValue": "100007",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Number of completed demand load requests that missed the L1, but hit the FB(fill buffer), because a preceding miss to the same cacheline initiated the line to be brought into L1, but data is not yet ready in L1.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd1",
         "EventName": "MEM_LOAD_RETIRED.FB_HIT",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts retired load instructions with at least one uop was load missed in L1 but hit FB (Fill Buffers) due to preceding miss to the same cache line with data not ready.",
         "SampleAfterValue": "100007",
         "UMask": "0x40",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired load instructions with L1 cache hits as data sources",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd1",
         "EventName": "MEM_LOAD_RETIRED.L1_HIT",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts retired load instructions with at least one uop that hit in the L1 data cache. This event includes all SW prefetches and lock instructions regardless of the data source.",
         "SampleAfterValue": "1000003",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired load instructions missed L1 cache as data sources",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd1",
         "EventName": "MEM_LOAD_RETIRED.L1_MISS",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts retired load instructions with at least one uop that missed in the L1 cache.",
         "SampleAfterValue": "200003",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired load instructions with L2 cache hits as data sources",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd1",
         "EventName": "MEM_LOAD_RETIRED.L2_HIT",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts retired load instructions with L2 cache hits as data sources.",
         "SampleAfterValue": "200003",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired load instructions missed L2 cache as data sources",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd1",
         "EventName": "MEM_LOAD_RETIRED.L2_MISS",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts retired load instructions missed L2 cache as data sources.",
         "SampleAfterValue": "100021",
         "UMask": "0x10",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired load instructions with L3 cache hits as data sources",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd1",
         "EventName": "MEM_LOAD_RETIRED.L3_HIT",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts retired load instructions with at least one uop that hit in the L3 cache.",
         "SampleAfterValue": "100021",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired load instructions missed L3 cache as data sources",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Data_LA": "1",
         "EventCode": "0xd1",
         "EventName": "MEM_LOAD_RETIRED.L3_MISS",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts retired load instructions with at least one uop that missed in the L3 cache.",
         "SampleAfterValue": "50021",
         "UMask": "0x20",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of load uops retired that hit in DRAM.",
+        "Data_LA": "1",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.DRAM_HIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x80",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of load uops retired that hit in the L2 cache.",
+        "Data_LA": "1",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.L2_HIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of load uops retired that hit in the L3 cache.",
+        "Data_LA": "1",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.L3_HIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x4",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that uops are blocked for any of the following reasons:  load buffer, store buffer or RSV full.",
+        "EventCode": "0x04",
+        "EventName": "MEM_SCHEDULER_BLOCK.ALL",
+        "SampleAfterValue": "20003",
+        "UMask": "0x7",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that uops are blocked due to a load buffer full condition.",
+        "EventCode": "0x04",
+        "EventName": "MEM_SCHEDULER_BLOCK.LD_BUF",
+        "SampleAfterValue": "20003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that uops are blocked due to an RSV full condition.",
+        "EventCode": "0x04",
+        "EventName": "MEM_SCHEDULER_BLOCK.RSV",
+        "SampleAfterValue": "20003",
+        "UMask": "0x4",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that uops are blocked due to a store buffer full condition.",
+        "EventCode": "0x04",
+        "EventName": "MEM_SCHEDULER_BLOCK.ST_BUF",
+        "SampleAfterValue": "20003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "MEM_STORE_RETIRED.L2_HIT",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x44",
         "EventName": "MEM_STORE_RETIRED.L2_HIT",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "200003",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of load uops retired.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.ALL_LOADS",
+        "PEBS": "1",
+        "PublicDescription": "Counts the total number of load uops retired.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x81",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of store uops retired.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.ALL_STORES",
+        "PEBS": "1",
+        "PublicDescription": "Counts the total number of store uops retired.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x82",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 128 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_128",
+        "MSRIndex": "0x3F6",
+        "MSRValue": "0x80",
+        "PEBS": "2",
+        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 128 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x5",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 16 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_16",
+        "MSRIndex": "0x3F6",
+        "MSRValue": "0x10",
+        "PEBS": "2",
+        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 16 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x5",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 256 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_256",
+        "MSRIndex": "0x3F6",
+        "MSRValue": "0x100",
+        "PEBS": "2",
+        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 256 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x5",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 32 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_32",
+        "MSRIndex": "0x3F6",
+        "MSRValue": "0x20",
+        "PEBS": "2",
+        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 32 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x5",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 4 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_4",
+        "MSRIndex": "0x3F6",
+        "MSRValue": "0x4",
+        "PEBS": "2",
+        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 4 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x5",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 512 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_512",
+        "MSRIndex": "0x3F6",
+        "MSRValue": "0x200",
+        "PEBS": "2",
+        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 512 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x5",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 64 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_64",
+        "MSRIndex": "0x3F6",
+        "MSRValue": "0x40",
+        "PEBS": "2",
+        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 64 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x5",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 8 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_8",
+        "MSRIndex": "0x3F6",
+        "MSRValue": "0x8",
+        "PEBS": "2",
+        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 8 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x5",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of retired split load uops.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.SPLIT_LOADS",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x41",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of stores uops retired. Counts with or without PEBS enabled.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.STORE_LATENCY",
+        "PEBS": "2",
+        "PublicDescription": "Counts the number of stores uops retired. Counts with or without PEBS enabled. If PEBS is enabled and a PEBS record is generated, will populate PEBS Latency and PEBS Data Source fields accordingly.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x6",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Retired memory uops for any access",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xe5",
         "EventName": "MEM_UOP_RETIRED.ANY",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Number of retired micro-operations (uops) for load or store memory accesses",
         "SampleAfterValue": "1000003",
         "UMask": "0x3",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts demand data reads that were supplied by the L3 cache.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_DATA_RD.L3_HIT",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x3F803C0001",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, and modified data was forwarded.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x10003C0001",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Counts demand data reads that resulted in a snoop hit in another cores caches, data forwarding is required as the data is modified.",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0x2A,0x2B",
         "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM",
         "MSRIndex": "0x1a6,0x1a7",
@@ -1182,8 +875,27 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, but no data was forwarded.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x4003C0001",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, and non-modified data was forwarded.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x8003C0001",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0x2A,0x2B",
         "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD",
         "MSRIndex": "0x1a6,0x1a7",
@@ -1193,8 +905,27 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_RFO.L3_HIT",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x3F803C0002",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache where a snoop was sent, the snoop hit, and modified data was forwarded.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x10003C0002",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that resulted in a snoop hit in another cores caches, data forwarding is required as the data is modified.",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0x2A,0x2B",
         "EventName": "OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM",
         "MSRIndex": "0x1a6,0x1a7",
@@ -1205,139 +936,111 @@
     },
     {
         "BriefDescription": "OFFCORE_REQUESTS.ALL_REQUESTS",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "OFFCORE_REQUESTS.ALL_REQUESTS",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x80",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Demand and prefetch data reads",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "OFFCORE_REQUESTS.DATA_RD",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the demand and prefetch data reads. All Core Data Reads include cacheable 'Demands' and L2 prefetchers (not L3 prefetchers). Counting also covers reads due to page walks resulted from any request type.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Demand Data Read requests sent to uncore",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x21",
         "EventName": "OFFCORE_REQUESTS.DEMAND_DATA_RD",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the Demand Data Read requests sent to uncore. Use it in conjunction with OFFCORE_REQUESTS_OUTSTANDING to determine average latency in the uncore.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "This event is deprecated. Refer to new event OFFCORE_REQUESTS_OUTSTANDING.DATA_RD",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
+        "Deprecated": "1",
         "Errata": "ADL038",
         "EventCode": "0x20",
         "EventName": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "1",
         "Errata": "ADL038",
         "EventCode": "0x20",
         "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "For every cycle where the core is waiting on at least 1 outstanding Demand RFO request, increments by 1.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "1",
         "EventCode": "0x20",
         "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "OFFCORE_REQUESTS_OUTSTANDING.DATA_RD",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "Errata": "ADL038",
         "EventCode": "0x20",
         "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DATA_RD",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Number of PREFETCHNTA instructions executed.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x40",
         "EventName": "SW_PREFETCH_ACCESS.NTA",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the number of PREFETCHNTA instructions executed.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Number of PREFETCHW instructions executed.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x40",
         "EventName": "SW_PREFETCH_ACCESS.PREFETCHW",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the number of PREFETCHW instructions executed.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Number of PREFETCHT0 instructions executed.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x40",
         "EventName": "SW_PREFETCH_ACCESS.T0",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the number of PREFETCHT0 instructions executed.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Number of PREFETCHT1 or PREFETCHT2 instructions executed.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x40",
         "EventName": "SW_PREFETCH_ACCESS.T1_T2",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the number of PREFETCHT1 or PREFETCHT2 instructions executed.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to instruction cache misses.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.ICACHE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x20",
+        "Unit": "cpu_atom"
     }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/floating-point.json b/tools/perf/pmu-events/arch/x86/alderlake/floating-point.json
index 48a4605fc057..3eb7cab9b431 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/floating-point.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/floating-point.json
@@ -1,165 +1,124 @@
 [
     {
-        "BriefDescription": "Counts the number of floating point operations retired that required microcode assist.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xc3",
-        "EventName": "MACHINE_CLEARS.FP_ASSIST",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "20003",
-        "Speculative": "1",
-        "UMask": "0x4",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of floating point divide uops retired (x87 and SSE, including x87 sqrt).",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xc2",
-        "EventName": "UOPS_RETIRED.FPDIV",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "2000003",
-        "UMask": "0x8",
-        "Unit": "cpu_atom"
-    },
-    {
         "BriefDescription": "ARITH.FPDIV_ACTIVE",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "1",
         "EventCode": "0xb0",
         "EventName": "ARITH.FPDIV_ACTIVE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts all microcode FP assists.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc1",
         "EventName": "ASSISTS.FP",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts all microcode Floating Point assists.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "ASSISTS.SSE_AVX_MIX",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc1",
         "EventName": "ASSISTS.SSE_AVX_MIX",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x10",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "FP_ARITH_DISPATCHED.PORT_0",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xb3",
         "EventName": "FP_ARITH_DISPATCHED.PORT_0",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "FP_ARITH_DISPATCHED.PORT_1",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xb3",
         "EventName": "FP_ARITH_DISPATCHED.PORT_1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "FP_ARITH_DISPATCHED.PORT_5",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xb3",
         "EventName": "FP_ARITH_DISPATCHED.PORT_5",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 2 computation operations, one for each element.  Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc7",
         "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 2 computation operations, one for each element.  Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.",
         "SampleAfterValue": "100003",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 4 computation operations, one for each element.  Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc7",
         "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 4 computation operations, one for each element.  Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.",
         "SampleAfterValue": "100003",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 4 computation operations, one for each element.  Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc7",
         "EventName": "FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 4 computation operations, one for each element.  Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.",
         "SampleAfterValue": "100003",
         "UMask": "0x10",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 8 computation operations, one for each element.  Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc7",
         "EventName": "FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 8 computation operations, one for each element.  Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB.  DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.",
         "SampleAfterValue": "100003",
         "UMask": "0x20",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc7",
         "EventName": "FP_ARITH_INST_RETIRED.SCALAR_DOUBLE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.",
         "SampleAfterValue": "100003",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc7",
         "EventName": "FP_ARITH_INST_RETIRED.SCALAR_SINGLE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below.  Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB.  FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.",
         "SampleAfterValue": "100003",
         "UMask": "0x2",
         "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "Counts the number of floating point operations retired that required microcode assist.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.FP_ASSIST",
+        "PublicDescription": "Counts the number of floating point operations retired that required microcode assist, which is not a reflection of the number of FP operations, instructions or uops.",
+        "SampleAfterValue": "20003",
+        "UMask": "0x4",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of floating point divide uops retired (x87 and SSE, including x87 sqrt).",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.FPDIV",
+        "PEBS": "1",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x8",
+        "Unit": "cpu_atom"
     }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/frontend.json b/tools/perf/pmu-events/arch/x86/alderlake/frontend.json
index da1a7ba0e568..250cd128b674 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/frontend.json
@@ -1,536 +1,416 @@
 [
     {
         "BriefDescription": "Counts the total number of BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0xe6",
         "EventName": "BACLEARS.ANY",
-        "PEBScounters": "0,1,2,3,4,5",
+        "PublicDescription": "Counts the total number of BACLEARS, which occur when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend.  Includes BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of requests to the instruction cache for one or more bytes of a cache line.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x80",
-        "EventName": "ICACHE.ACCESSES",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "Speculative": "1",
-        "UMask": "0x3",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of instruction cache misses.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x80",
-        "EventName": "ICACHE.MISSES",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "Speculative": "1",
-        "UMask": "0x2",
-        "Unit": "cpu_atom"
-    },
-    {
         "BriefDescription": "Stalls caused by changing prefix length of the instruction.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x87",
         "EventName": "DECODE.LCP",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts cycles that the Instruction Length decoder (ILD) stalls occurred due to dynamically changing prefix length of the decoded instruction (by operand size prefix instruction 0x66, address size prefix instruction 0x67 or REX.W for Intel64). Count is proportional to the number of prefixes in a 16B-line. This may result in a three-cycle penalty for each LCP (Length changing prefix) in a 16-byte chunk.",
         "SampleAfterValue": "500009",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles the Microcode Sequencer is busy.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x87",
         "EventName": "DECODE.MS_BUSY",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "500009",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "DSB-to-MITE switch true penalty cycles.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x61",
         "EventName": "DSB2MITE_SWITCHES.PENALTY_CYCLES",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Decode Stream Buffer (DSB) is a Uop-cache that holds translations of previously fetched instructions that were decoded by the legacy x86 decode pipeline (MITE). This event counts fetch penalty cycles when a transition occurs from DSB to MITE.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired Instructions who experienced DSB miss.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc6",
         "EventName": "FRONTEND_RETIRED.ANY_DSB_MISS",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x1",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts retired Instructions that experienced DSB (Decode stream buffer i.e. the decoded instruction-cache) miss.",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired Instructions who experienced a critical DSB miss.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc6",
         "EventName": "FRONTEND_RETIRED.DSB_MISS",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x11",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Number of retired Instructions that experienced a critical DSB (Decode stream buffer i.e. the decoded instruction-cache) miss. Critical means stalls were exposed to the back-end as a result of the DSB miss.",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired Instructions who experienced iTLB true miss.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc6",
         "EventName": "FRONTEND_RETIRED.ITLB_MISS",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x14",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts retired Instructions that experienced iTLB (Instruction TLB) true miss.",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired Instructions who experienced Instruction L1 Cache true miss.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc6",
         "EventName": "FRONTEND_RETIRED.L1I_MISS",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x12",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts retired Instructions who experienced Instruction L1 Cache true miss.",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired Instructions who experienced Instruction L2 Cache true miss.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc6",
         "EventName": "FRONTEND_RETIRED.L2_MISS",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x13",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts retired Instructions who experienced Instruction L2 Cache true miss.",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired instructions after front-end starvation of at least 1 cycle",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc6",
         "EventName": "FRONTEND_RETIRED.LATENCY_GE_1",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x600106",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of at least 1 cycle which was not interrupted by a back-end stall.",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 128 cycles which was not interrupted by a back-end stall.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc6",
         "EventName": "FRONTEND_RETIRED.LATENCY_GE_128",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x608006",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 128 cycles which was not interrupted by a back-end stall.",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 16 cycles which was not interrupted by a back-end stall.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc6",
         "EventName": "FRONTEND_RETIRED.LATENCY_GE_16",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x601006",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 16 cycles. During this period the front-end delivered no uops.",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired instructions after front-end starvation of at least 2 cycles",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc6",
         "EventName": "FRONTEND_RETIRED.LATENCY_GE_2",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x600206",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of at least 2 cycles which was not interrupted by a back-end stall.",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 256 cycles which was not interrupted by a back-end stall.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc6",
         "EventName": "FRONTEND_RETIRED.LATENCY_GE_256",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x610006",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 256 cycles which was not interrupted by a back-end stall.",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired instructions that are fetched after an interval where the front-end had at least 1 bubble-slot for a period of 2 cycles which was not interrupted by a back-end stall.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc6",
         "EventName": "FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x100206",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts retired instructions that are delivered to the back-end after the front-end had at least 1 bubble-slot for a period of 2 cycles. A bubble-slot is an empty issue-pipeline slot while there was no RAT stall.",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 32 cycles which was not interrupted by a back-end stall.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc6",
         "EventName": "FRONTEND_RETIRED.LATENCY_GE_32",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x602006",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 32 cycles. During this period the front-end delivered no uops.",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 4 cycles which was not interrupted by a back-end stall.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc6",
         "EventName": "FRONTEND_RETIRED.LATENCY_GE_4",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x600406",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 4 cycles which was not interrupted by a back-end stall.",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 512 cycles which was not interrupted by a back-end stall.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc6",
         "EventName": "FRONTEND_RETIRED.LATENCY_GE_512",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x620006",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 512 cycles which was not interrupted by a back-end stall.",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 64 cycles which was not interrupted by a back-end stall.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc6",
         "EventName": "FRONTEND_RETIRED.LATENCY_GE_64",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x604006",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 64 cycles which was not interrupted by a back-end stall.",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 8 cycles which was not interrupted by a back-end stall.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc6",
         "EventName": "FRONTEND_RETIRED.LATENCY_GE_8",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x600806",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 8 cycles. During this period the front-end delivered no uops.",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "FRONTEND_RETIRED.MS_FLOWS",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc6",
         "EventName": "FRONTEND_RETIRED.MS_FLOWS",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x8",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired Instructions who experienced STLB (2nd level TLB) true miss.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc6",
         "EventName": "FRONTEND_RETIRED.STLB_MISS",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x15",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts retired Instructions that experienced STLB (2nd level TLB) true miss.",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "FRONTEND_RETIRED.UNKNOWN_BRANCH",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc6",
         "EventName": "FRONTEND_RETIRED.UNKNOWN_BRANCH",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x17",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of requests to the instruction cache for one or more bytes of a cache line.",
+        "EventCode": "0x80",
+        "EventName": "ICACHE.ACCESSES",
+        "PublicDescription": "Counts the total number of requests to the instruction cache.  The event only counts new cache line accesses, so that multiple back to back fetches to the exact same cache line or byte chunk count as one.  Specifically, the event counts when accesses from sequential code crosses the cache line boundary, or when a branch target is moved to a new line or to a non-sequential byte chunk of the same line.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x3",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of instruction cache misses.",
+        "EventCode": "0x80",
+        "EventName": "ICACHE.MISSES",
+        "PublicDescription": "Counts the number of missed requests to the instruction cache.  The event only counts new cache line accesses, so that multiple back to back fetches to the exact same cache line and byte chunk count as one.  Specifically, the event counts when accesses from sequential code crosses the cache line boundary, or when a branch target is moved to a new line or to a non-sequential byte chunk of the same line.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Cycles where a code fetch is stalled due to L1 instruction cache miss.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x80",
         "EventName": "ICACHE_DATA.STALLS",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts cycles where a code line fetch is stalled due to an L1 instruction cache miss. The decode pipeline works at a 32 Byte granularity.",
         "SampleAfterValue": "500009",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles where a code fetch is stalled due to L1 instruction cache tag miss.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x83",
         "EventName": "ICACHE_TAG.STALLS",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts cycles where a code fetch is stalled due to L1 instruction cache tag miss.",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering any Uop",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "1",
         "EventCode": "0x79",
         "EventName": "IDQ.DSB_CYCLES_ANY",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles DSB is delivering optimal number of Uops",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "6",
         "EventCode": "0x79",
         "EventName": "IDQ.DSB_CYCLES_OK",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x79",
         "EventName": "IDQ.DSB_UOPS",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles MITE is delivering any Uop",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "1",
         "EventCode": "0x79",
         "EventName": "IDQ.MITE_CYCLES_ANY",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the number of cycles uops were delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles MITE is delivering optimal number of Uops",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "6",
         "EventCode": "0x79",
         "EventName": "IDQ.MITE_CYCLES_OK",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Uops delivered to Instruction Decode Queue (IDQ) from MITE path",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x79",
         "EventName": "IDQ.MITE_UOPS",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the MITE path. This also means that uops are not being delivered from the Decode Stream Buffer (DSB).",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles when uops are being delivered to IDQ while MS is busy",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "1",
         "EventCode": "0x79",
         "EventName": "IDQ.MS_CYCLES_ANY",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Uops maybe initiated by Decode Stream Buffer (DSB) or MITE.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x20",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Number of switches from DSB or MITE to the MS",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "1",
         "EdgeDetect": "1",
         "EventCode": "0x79",
         "EventName": "IDQ.MS_SWITCHES",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x20",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Uops delivered to IDQ while MS is busy",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x79",
         "EventName": "IDQ.MS_UOPS",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the total number of uops delivered by the Microcode Sequencer (MS).",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x20",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Uops not delivered by IDQ when backend of the machine is not stalled",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0x9c",
         "EventName": "IDQ_UOPS_NOT_DELIVERED.CORE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts the number of uops not delivered to by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle.",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles when no uops are not delivered by the IDQ when backend of the machine is not stalled",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "6",
         "EventCode": "0x9c",
         "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle.",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles when optimal number of uops was delivered to the back-end when the back-end is not stalled",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "1",
         "EventCode": "0x9c",
         "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK",
         "Invert": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle.",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     }
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/memory.json b/tools/perf/pmu-events/arch/x86/alderlake/memory.json
index f894e4a0212b..7595eb4ab46f 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/memory.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/memory.json
@@ -1,339 +1,234 @@
 [
     {
+        "BriefDescription": "Execution stalls while L3 cache miss demand load is outstanding.",
+        "CounterMask": "6",
+        "EventCode": "0xa3",
+        "EventName": "CYCLE_ACTIVITY.STALLS_L3_MISS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x6",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to any number of reasons, including an L1 miss, WCB full, pagewalk, store address block or store data block, on a load that retires.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0x05",
         "EventName": "LD_HEAD.ANY_AT_RET",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0xff",
         "Unit": "cpu_atom"
     },
     {
         "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to a core bound stall including a store address match, a DTLB miss or a page walk that detains the load from retiring.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0x05",
         "EventName": "LD_HEAD.L1_BOUND_AT_RET",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0xf4",
         "Unit": "cpu_atom"
     },
     {
         "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to other block cases.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0x05",
         "EventName": "LD_HEAD.OTHER_AT_RET",
-        "PEBScounters": "0,1,2,3,4,5",
+        "PublicDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to other block cases such as pipeline conflicts, fences, etc.",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0xc0",
         "Unit": "cpu_atom"
     },
     {
         "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a pagewalk.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0x05",
         "EventName": "LD_HEAD.PGWALK_AT_RET",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0xa0",
         "Unit": "cpu_atom"
     },
     {
         "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a store address match.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0x05",
         "EventName": "LD_HEAD.ST_ADDR_AT_RET",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x84",
         "Unit": "cpu_atom"
     },
     {
         "BriefDescription": "Counts the number of machine clears due to memory ordering caused by a snoop from an external agent. Does not count internally generated machine clears such as those due to memory disambiguation.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0xc3",
         "EventName": "MACHINE_CLEARS.MEMORY_ORDERING",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "20003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xB7",
-        "EventName": "OCR.DEMAND_DATA_RD.L3_MISS",
-        "MSRIndex": "0x1a6,0x1a7",
-        "MSRValue": "0x3F84400001",
-        "SampleAfterValue": "100003",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xB7",
-        "EventName": "OCR.DEMAND_DATA_RD.L3_MISS_LOCAL",
-        "MSRIndex": "0x1a6,0x1a7",
-        "MSRValue": "0x3F84400001",
-        "SampleAfterValue": "100003",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xB7",
-        "EventName": "OCR.DEMAND_RFO.L3_MISS",
-        "MSRIndex": "0x1a6,0x1a7",
-        "MSRValue": "0x3F84400002",
-        "SampleAfterValue": "100003",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xB7",
-        "EventName": "OCR.DEMAND_RFO.L3_MISS_LOCAL",
-        "MSRIndex": "0x1a6,0x1a7",
-        "MSRValue": "0x3F84400002",
-        "SampleAfterValue": "100003",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Execution stalls while L3 cache miss demand load is outstanding.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
-        "CounterMask": "6",
-        "EventCode": "0xa3",
-        "EventName": "CYCLE_ACTIVITY.STALLS_L3_MISS",
-        "PEBScounters": "0,1,2,3",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x6",
-        "Unit": "cpu_core"
-    },
-    {
         "BriefDescription": "Number of machine clears due to memory ordering conflicts.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc3",
         "EventName": "MACHINE_CLEARS.MEMORY_ORDERING",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts the number of Machine Clears detected dye to memory ordering. Memory Ordering Machine Clears may apply when a memory read may not conform to the memory ordering rules of the x86 architecture",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles while L1 cache miss demand load is outstanding.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "2",
         "EventCode": "0x47",
         "EventName": "MEMORY_ACTIVITY.CYCLES_L1D_MISS",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Execution stalls while L1 cache miss demand load is outstanding.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "3",
         "EventCode": "0x47",
         "EventName": "MEMORY_ACTIVITY.STALLS_L1D_MISS",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x3",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "MEMORY_ACTIVITY.STALLS_L2_MISS",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "5",
         "EventCode": "0x47",
         "EventName": "MEMORY_ACTIVITY.STALLS_L2_MISS",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x5",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "MEMORY_ACTIVITY.STALLS_L3_MISS",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "9",
         "EventCode": "0x47",
         "EventName": "MEMORY_ACTIVITY.STALLS_L3_MISS",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x9",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 128 cycles.",
-        "CollectPEBSRecord": "2",
-        "Counter": "1,2,3,4,5,6,7",
         "Data_LA": "1",
         "EventCode": "0xcd",
         "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_128",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x80",
         "PEBS": "2",
-        "PEBScounters": "1,2,3,4,5,6,7",
+        "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 128 cycles.  Reported latency may be longer than just the memory latency.",
         "SampleAfterValue": "1009",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 16 cycles.",
-        "CollectPEBSRecord": "2",
-        "Counter": "1,2,3,4,5,6,7",
         "Data_LA": "1",
         "EventCode": "0xcd",
         "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_16",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x10",
         "PEBS": "2",
-        "PEBScounters": "1,2,3,4,5,6,7",
+        "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 16 cycles.  Reported latency may be longer than just the memory latency.",
         "SampleAfterValue": "20011",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 256 cycles.",
-        "CollectPEBSRecord": "2",
-        "Counter": "1,2,3,4,5,6,7",
         "Data_LA": "1",
         "EventCode": "0xcd",
         "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_256",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x100",
         "PEBS": "2",
-        "PEBScounters": "1,2,3,4,5,6,7",
+        "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 256 cycles.  Reported latency may be longer than just the memory latency.",
         "SampleAfterValue": "503",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 32 cycles.",
-        "CollectPEBSRecord": "2",
-        "Counter": "1,2,3,4,5,6,7",
         "Data_LA": "1",
         "EventCode": "0xcd",
         "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_32",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x20",
         "PEBS": "2",
-        "PEBScounters": "1,2,3,4,5,6,7",
+        "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 32 cycles.  Reported latency may be longer than just the memory latency.",
         "SampleAfterValue": "100007",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 4 cycles.",
-        "CollectPEBSRecord": "2",
-        "Counter": "1,2,3,4,5,6,7",
         "Data_LA": "1",
         "EventCode": "0xcd",
         "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_4",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x4",
         "PEBS": "2",
-        "PEBScounters": "1,2,3,4,5,6,7",
+        "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 4 cycles.  Reported latency may be longer than just the memory latency.",
         "SampleAfterValue": "100003",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 512 cycles.",
-        "CollectPEBSRecord": "2",
-        "Counter": "1,2,3,4,5,6,7",
         "Data_LA": "1",
         "EventCode": "0xcd",
         "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_512",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x200",
         "PEBS": "2",
-        "PEBScounters": "1,2,3,4,5,6,7",
+        "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 512 cycles.  Reported latency may be longer than just the memory latency.",
         "SampleAfterValue": "101",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 64 cycles.",
-        "CollectPEBSRecord": "2",
-        "Counter": "1,2,3,4,5,6,7",
         "Data_LA": "1",
         "EventCode": "0xcd",
         "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_64",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x40",
         "PEBS": "2",
-        "PEBScounters": "1,2,3,4,5,6,7",
+        "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 64 cycles.  Reported latency may be longer than just the memory latency.",
         "SampleAfterValue": "2003",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 8 cycles.",
-        "CollectPEBSRecord": "2",
-        "Counter": "1,2,3,4,5,6,7",
         "Data_LA": "1",
         "EventCode": "0xcd",
         "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_8",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x8",
         "PEBS": "2",
-        "PEBScounters": "1,2,3,4,5,6,7",
+        "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 8 cycles.  Reported latency may be longer than just the memory latency.",
         "SampleAfterValue": "50021",
-        "TakenAlone": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired memory store access operations. A PDist event for PEBS Store Latency Facility.",
-        "CollectPEBSRecord": "2",
         "Data_LA": "1",
         "EventCode": "0xcd",
         "EventName": "MEM_TRANS_RETIRED.STORE_SAMPLE",
         "PEBS": "2",
+        "PublicDescription": "Counts Retired memory accesses with at least 1 store operation. This PEBS event is the precisely-distributed (PDist) trigger covering all stores uops for sampling by the PEBS Store Latency Facility. The facility is described in Intel SDM Volume 3 section 19.9.8",
         "SampleAfterValue": "1000003",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.",
-        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_DATA_RD.L3_MISS",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x3F84400001",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.",
         "EventCode": "0x2A,0x2B",
         "EventName": "OCR.DEMAND_DATA_RD.L3_MISS",
         "MSRIndex": "0x1a6,0x1a7",
@@ -343,8 +238,27 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_DATA_RD.L3_MISS_LOCAL",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x3F84400001",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_RFO.L3_MISS",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x3F84400002",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0x2A,0x2B",
         "EventName": "OCR.DEMAND_RFO.L3_MISS",
         "MSRIndex": "0x1a6,0x1a7",
@@ -352,5 +266,33 @@
         "SampleAfterValue": "100003",
         "UMask": "0x1",
         "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_RFO.L3_MISS_LOCAL",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x3F84400002",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Demand Data Read requests who miss L3 cache",
+        "EventCode": "0x21",
+        "EventName": "OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD",
+        "PublicDescription": "Demand Data Read requests who miss L3 cache.",
+        "SampleAfterValue": "100003",
+        "UMask": "0x10",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "For every cycle, increments by the number of demand data read requests pending that are known to have missed the L3 cache.",
+        "EventCode": "0x20",
+        "EventName": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD",
+        "PublicDescription": "For every cycle, increments by the number of demand data read requests pending that are known to have missed the L3 cache.  Note that this does not capture all elapsed cycles while requests are outstanding - only cycles from when the requests were known by the requesting core to have missed the L3 cache.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x10",
+        "Unit": "cpu_core"
     }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/other.json b/tools/perf/pmu-events/arch/x86/alderlake/other.json
index c49d8ce27310..329c611d7cf7 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/other.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/other.json
@@ -1,111 +1,66 @@
 [
     {
-        "BriefDescription": "Counts modified writebacks from L1 cache and L2 cache that have any type of response.",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xB7",
-        "EventName": "OCR.COREWB_M.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
-        "MSRValue": "0x10008",
-        "SampleAfterValue": "100003",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts demand data reads that have any type of response.",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xB7",
-        "EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
-        "MSRValue": "0x10001",
-        "SampleAfterValue": "100003",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xB7",
-        "EventName": "OCR.DEMAND_RFO.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
-        "MSRValue": "0x10002",
-        "SampleAfterValue": "100003",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts streaming stores that have any type of response.",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xB7",
-        "EventName": "OCR.STREAMING_WR.ANY_RESPONSE",
-        "MSRIndex": "0x1a6,0x1a7",
-        "MSRValue": "0x10800",
-        "SampleAfterValue": "100003",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
         "BriefDescription": "ASSISTS.HARDWARE",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc1",
         "EventName": "ASSISTS.HARDWARE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "ASSISTS.PAGE_FAULT",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc1",
         "EventName": "ASSISTS.PAGE_FAULT",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "CORE_POWER.LICENSE_1",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x28",
         "EventName": "CORE_POWER.LICENSE_1",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "CORE_POWER.LICENSE_2",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x28",
         "EventName": "CORE_POWER.LICENSE_2",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "CORE_POWER.LICENSE_3",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x28",
         "EventName": "CORE_POWER.LICENSE_3",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "200003",
-        "Speculative": "1",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts modified writebacks from L1 cache and L2 cache that have any type of response.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.COREWB_M.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x10008",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts demand data reads that have any type of response.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x10001",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Counts demand data reads that have any type of response.",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0x2A,0x2B",
         "EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE",
         "MSRIndex": "0x1a6,0x1a7",
@@ -116,7 +71,6 @@
     },
     {
         "BriefDescription": "Counts demand data reads that were supplied by DRAM.",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0x2A,0x2B",
         "EventName": "OCR.DEMAND_DATA_RD.DRAM",
         "MSRIndex": "0x1a6,0x1a7",
@@ -126,8 +80,17 @@
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_RFO.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x10002",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0x2A,0x2B",
         "EventName": "OCR.DEMAND_RFO.ANY_RESPONSE",
         "MSRIndex": "0x1a6,0x1a7",
@@ -138,7 +101,16 @@
     },
     {
         "BriefDescription": "Counts streaming stores that have any type of response.",
-        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xB7",
+        "EventName": "OCR.STREAMING_WR.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x10800",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts streaming stores that have any type of response.",
         "EventCode": "0x2A,0x2B",
         "EventName": "OCR.STREAMING_WR.ANY_RESPONSE",
         "MSRIndex": "0x1a6,0x1a7",
@@ -149,68 +121,52 @@
     },
     {
         "BriefDescription": "Cycles when Reservation Station (RS) is empty for the thread.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xa5",
         "EventName": "RS.EMPTY",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into starvation periods (e.g. branch mispredictions or i-cache misses)",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x7",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts end of periods where the Reservation Station (RS) was empty.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "1",
         "EdgeDetect": "1",
         "EventCode": "0xa5",
         "EventName": "RS.EMPTY_COUNT",
         "Invert": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts end of periods where the Reservation Station (RS) was empty. Could be useful to closely sample on front-end latency issues (see the FRONTEND_RETIRED event of designated precise events)",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x7",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "This event is deprecated. Refer to new event RS.EMPTY_COUNT",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "1",
+        "Deprecated": "1",
         "EdgeDetect": "1",
         "EventCode": "0xa5",
         "EventName": "RS_EMPTY.COUNT",
         "Invert": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x7",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "This event is deprecated. Refer to new event RS.EMPTY",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
+        "Deprecated": "1",
         "EventCode": "0xa5",
         "EventName": "RS_EMPTY.CYCLES",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x7",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "XQ.FULL_CYCLES",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "1",
         "EventCode": "0x2d",
         "EventName": "XQ.FULL_CYCLES",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     }
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json b/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json
index 1a137f7f8b7e..f46fa7ba168a 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json
@@ -1,2168 +1,1634 @@
 [
     {
+        "BriefDescription": "This event is deprecated. Refer to new event ARITH.DIV_ACTIVE",
+        "CounterMask": "1",
+        "Deprecated": "1",
+        "EventCode": "0xb0",
+        "EventName": "ARITH.DIVIDER_ACTIVE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x9",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "Cycles when divide unit is busy executing divide or square root operations.",
+        "CounterMask": "1",
+        "EventCode": "0xb0",
+        "EventName": "ARITH.DIV_ACTIVE",
+        "PublicDescription": "Counts cycles when divide unit is busy executing divide or square root operations. Accounts for integer and floating-point operations.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x9",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "This event is deprecated. Refer to new event ARITH.FPDIV_ACTIVE",
+        "CounterMask": "1",
+        "Deprecated": "1",
+        "EventCode": "0xb0",
+        "EventName": "ARITH.FP_DIVIDER_ACTIVE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "This event counts the cycles the integer divider is busy.",
+        "EventCode": "0xb0",
+        "EventName": "ARITH.IDIV_ACTIVE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x8",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "This event is deprecated. Refer to new event ARITH.IDIV_ACTIVE",
+        "CounterMask": "1",
+        "Deprecated": "1",
+        "EventCode": "0xb0",
+        "EventName": "ARITH.INT_DIVIDER_ACTIVE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x8",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "Number of occurrences where a microcode assist is invoked by hardware.",
+        "EventCode": "0xc1",
+        "EventName": "ASSISTS.ANY",
+        "PublicDescription": "Counts the number of occurrences where a microcode assist is invoked by hardware. Examples include AD (page Access Dirty), FP and AVX related assists.",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1b",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Counts the total number of branch instructions retired for all branch types.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.ALL_BRANCHES",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
+        "PublicDescription": "Counts the total number of instructions in which the instruction pointer (IP) of the processor is resteered due to a branch instruction and the branch instruction successfully retires.  All branch type instructions are accounted for.",
         "SampleAfterValue": "200003",
         "Unit": "cpu_atom"
     },
     {
+        "BriefDescription": "All branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.ALL_BRANCHES",
+        "PEBS": "1",
+        "PublicDescription": "Counts all branch instructions retired.",
+        "SampleAfterValue": "400009",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.NEAR_CALL",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
+        "Deprecated": "1",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.CALL",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0xf9",
         "Unit": "cpu_atom"
     },
     {
         "BriefDescription": "Counts the number of retired JCC (Jump on Conditional Code) branch instructions retired, includes both taken and not taken branches.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.COND",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0x7e",
         "Unit": "cpu_atom"
     },
     {
+        "BriefDescription": "Conditional branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.COND",
+        "PEBS": "1",
+        "PublicDescription": "Counts conditional branch instructions retired.",
+        "SampleAfterValue": "400009",
+        "UMask": "0x11",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "Not taken branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.COND_NTAKEN",
+        "PEBS": "1",
+        "PublicDescription": "Counts not taken branch instructions retired.",
+        "SampleAfterValue": "400009",
+        "UMask": "0x10",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Counts the number of taken JCC (Jump on Conditional Code) branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.COND_TAKEN",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0xfe",
         "Unit": "cpu_atom"
     },
     {
+        "BriefDescription": "Taken conditional branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.COND_TAKEN",
+        "PEBS": "1",
+        "PublicDescription": "Counts taken conditional branch instructions retired.",
+        "SampleAfterValue": "400009",
+        "UMask": "0x1",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Counts the number of far branch instructions retired, includes far jump, far call and return, and interrupt call and return.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.FAR_BRANCH",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0xbf",
         "Unit": "cpu_atom"
     },
     {
+        "BriefDescription": "Far branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.FAR_BRANCH",
+        "PEBS": "1",
+        "PublicDescription": "Counts far branch instructions retired.",
+        "SampleAfterValue": "100007",
+        "UMask": "0x40",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Counts the number of near indirect JMP and near indirect CALL branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.INDIRECT",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0xeb",
         "Unit": "cpu_atom"
     },
     {
+        "BriefDescription": "Indirect near branch instructions retired (excluding returns)",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.INDIRECT",
+        "PEBS": "1",
+        "PublicDescription": "Counts near indirect branch instructions retired excluding returns. TSX abort is an indirect branch.",
+        "SampleAfterValue": "100003",
+        "UMask": "0x80",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Counts the number of near indirect CALL branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.INDIRECT_CALL",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0xfb",
         "Unit": "cpu_atom"
     },
     {
         "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.INDIRECT_CALL",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
+        "Deprecated": "1",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.IND_CALL",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0xfb",
         "Unit": "cpu_atom"
     },
     {
         "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.COND",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
+        "Deprecated": "1",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.JCC",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0x7e",
         "Unit": "cpu_atom"
     },
     {
         "BriefDescription": "Counts the number of near CALL branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.NEAR_CALL",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0xf9",
         "Unit": "cpu_atom"
     },
     {
+        "BriefDescription": "Direct and indirect near call instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.NEAR_CALL",
+        "PEBS": "1",
+        "PublicDescription": "Counts both direct and indirect near call instructions retired.",
+        "SampleAfterValue": "100007",
+        "UMask": "0x2",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Counts the number of near RET branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.NEAR_RETURN",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0xf7",
         "Unit": "cpu_atom"
     },
     {
+        "BriefDescription": "Return instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.NEAR_RETURN",
+        "PEBS": "1",
+        "PublicDescription": "Counts return instructions retired.",
+        "SampleAfterValue": "100007",
+        "UMask": "0x8",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "Taken branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.NEAR_TAKEN",
+        "PEBS": "1",
+        "PublicDescription": "Counts taken branch instructions retired.",
+        "SampleAfterValue": "400009",
+        "UMask": "0x20",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.INDIRECT",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
+        "Deprecated": "1",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.NON_RETURN_IND",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0xeb",
         "Unit": "cpu_atom"
     },
     {
         "BriefDescription": "Counts the number of near relative CALL branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.REL_CALL",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0xfd",
         "Unit": "cpu_atom"
     },
     {
         "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.NEAR_RETURN",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
+        "Deprecated": "1",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.RETURN",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0xf7",
         "Unit": "cpu_atom"
     },
     {
         "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.COND_TAKEN",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
+        "Deprecated": "1",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.TAKEN_JCC",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0xfe",
         "Unit": "cpu_atom"
     },
     {
         "BriefDescription": "Counts the total number of mispredicted branch instructions retired for all branch types.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0xc5",
         "EventName": "BR_MISP_RETIRED.ALL_BRANCHES",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
+        "PublicDescription": "Counts the total number of mispredicted branch instructions retired.  All branch type instructions are accounted for.  Prediction of the branch target address enables the processor to begin executing instructions before the non-speculative execution path is known. The branch prediction unit (BPU) predicts the target address based on the instruction pointer (IP) of the branch and on the execution path through which execution reached this IP.    A branch misprediction occurs when the prediction is wrong, and results in discarding all instructions executed in the speculative path and re-fetching from the correct path.",
         "SampleAfterValue": "200003",
         "Unit": "cpu_atom"
     },
     {
+        "BriefDescription": "All mispredicted branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.ALL_BRANCHES",
+        "PEBS": "1",
+        "PublicDescription": "Counts all the retired branch instructions that were mispredicted by the processor. A branch misprediction occurs when the processor incorrectly predicts the destination of the branch.  When the misprediction is discovered at execution, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path.",
+        "SampleAfterValue": "400009",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Counts the number of mispredicted JCC (Jump on Conditional Code) branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0xc5",
         "EventName": "BR_MISP_RETIRED.COND",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0x7e",
         "Unit": "cpu_atom"
     },
     {
+        "BriefDescription": "Mispredicted conditional branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.COND",
+        "PEBS": "1",
+        "PublicDescription": "Counts mispredicted conditional branch instructions retired.",
+        "SampleAfterValue": "400009",
+        "UMask": "0x11",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "Mispredicted non-taken conditional branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.COND_NTAKEN",
+        "PEBS": "1",
+        "PublicDescription": "Counts the number of conditional branch instructions retired that were mispredicted and the branch direction was not taken.",
+        "SampleAfterValue": "400009",
+        "UMask": "0x10",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Counts the number of mispredicted taken JCC (Jump on Conditional Code) branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0xc5",
         "EventName": "BR_MISP_RETIRED.COND_TAKEN",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0xfe",
         "Unit": "cpu_atom"
     },
     {
+        "BriefDescription": "number of branch instructions retired that were mispredicted and taken.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.COND_TAKEN",
+        "PEBS": "1",
+        "PublicDescription": "Counts taken conditional mispredicted branch instructions retired.",
+        "SampleAfterValue": "400009",
+        "UMask": "0x1",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Counts the number of mispredicted near indirect JMP and near indirect CALL branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0xc5",
         "EventName": "BR_MISP_RETIRED.INDIRECT",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0xeb",
         "Unit": "cpu_atom"
     },
     {
         "BriefDescription": "Counts the number of mispredicted near indirect CALL branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0xc5",
         "EventName": "BR_MISP_RETIRED.INDIRECT_CALL",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0xfb",
         "Unit": "cpu_atom"
     },
     {
+        "BriefDescription": "Mispredicted indirect CALL retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.INDIRECT_CALL",
+        "PEBS": "1",
+        "PublicDescription": "Counts retired mispredicted indirect (near taken) CALL instructions, including both register and memory indirect.",
+        "SampleAfterValue": "400009",
+        "UMask": "0x2",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "This event is deprecated. Refer to new event BR_MISP_RETIRED.INDIRECT_CALL",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
+        "Deprecated": "1",
         "EventCode": "0xc5",
         "EventName": "BR_MISP_RETIRED.IND_CALL",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0xfb",
         "Unit": "cpu_atom"
     },
     {
         "BriefDescription": "This event is deprecated. Refer to new event BR_MISP_RETIRED.COND",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
+        "Deprecated": "1",
         "EventCode": "0xc5",
         "EventName": "BR_MISP_RETIRED.JCC",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0x7e",
         "Unit": "cpu_atom"
     },
     {
+        "BriefDescription": "Number of near branch instructions retired that were mispredicted and taken.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.NEAR_TAKEN",
+        "PEBS": "1",
+        "PublicDescription": "Counts number of near branch instructions retired that were mispredicted and taken.",
+        "SampleAfterValue": "400009",
+        "UMask": "0x20",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "This event is deprecated. Refer to new event BR_MISP_RETIRED.INDIRECT",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
+        "Deprecated": "1",
         "EventCode": "0xc5",
         "EventName": "BR_MISP_RETIRED.NON_RETURN_IND",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0xeb",
         "Unit": "cpu_atom"
     },
     {
+        "BriefDescription": "This event counts the number of mispredicted ret instructions retired. Non PEBS",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.RET",
+        "PEBS": "1",
+        "PublicDescription": "This is a non-precise version (that is, does not use PEBS) of the event that counts mispredicted return instructions retired.",
+        "SampleAfterValue": "100007",
+        "UMask": "0x8",
+        "Unit": "cpu_core"
+    },
+    {
         "BriefDescription": "Counts the number of mispredicted near RET branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
         "EventCode": "0xc5",
         "EventName": "BR_MISP_RETIRED.RETURN",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0xf7",
         "Unit": "cpu_atom"
     },
     {
         "BriefDescription": "This event is deprecated. Refer to new event BR_MISP_RETIRED.COND_TAKEN",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
+        "Deprecated": "1",
         "EventCode": "0xc5",
         "EventName": "BR_MISP_RETIRED.TAKEN_JCC",
         "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
         "SampleAfterValue": "200003",
         "UMask": "0xfe",
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of unhalted core clock cycles. (Fixed event)",
-        "CollectPEBSRecord": "2",
-        "Counter": "Fixed counter 1",
-        "EventName": "CPU_CLK_UNHALTED.CORE",
-        "PEBScounters": "33",
-        "SampleAfterValue": "2000003",
-        "Speculative": "1",
-        "UMask": "0x2",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of unhalted core clock cycles.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x3c",
-        "EventName": "CPU_CLK_UNHALTED.CORE_P",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "2000003",
-        "Speculative": "1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of unhalted reference clock cycles at TSC frequency. (Fixed event)",
-        "CollectPEBSRecord": "2",
-        "Counter": "Fixed counter 2",
-        "EventName": "CPU_CLK_UNHALTED.REF_TSC",
-        "PEBScounters": "34",
-        "SampleAfterValue": "2000003",
-        "Speculative": "1",
-        "UMask": "0x3",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of unhalted reference clock cycles at TSC frequency.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x3c",
-        "EventName": "CPU_CLK_UNHALTED.REF_TSC_P",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "2000003",
-        "Speculative": "1",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of unhalted core clock cycles. (Fixed event)",
-        "CollectPEBSRecord": "2",
-        "Counter": "Fixed counter 1",
-        "EventName": "CPU_CLK_UNHALTED.THREAD",
-        "PEBScounters": "33",
-        "SampleAfterValue": "2000003",
-        "Speculative": "1",
-        "UMask": "0x2",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of unhalted core clock cycles.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x3c",
-        "EventName": "CPU_CLK_UNHALTED.THREAD_P",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "2000003",
-        "Speculative": "1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the total number of instructions retired. (Fixed event)",
-        "CollectPEBSRecord": "2",
-        "Counter": "Fixed counter 0",
-        "EventName": "INST_RETIRED.ANY",
-        "PEBS": "1",
-        "PEBScounters": "32",
-        "SampleAfterValue": "2000003",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the total number of instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xc0",
-        "EventName": "INST_RETIRED.ANY_P",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "2000003",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "This event is deprecated. Refer to new event LD_BLOCKS.ADDRESS_ALIAS",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x03",
-        "EventName": "LD_BLOCKS.4K_ALIAS",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "UMask": "0x4",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of retired loads that are blocked because it initially appears to be store forward blocked, but subsequently is shown not to be blocked based on 4K alias check.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x03",
-        "EventName": "LD_BLOCKS.ADDRESS_ALIAS",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "UMask": "0x4",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of retired loads that are blocked because its address exactly matches an older store whose data is not ready.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x03",
-        "EventName": "LD_BLOCKS.DATA_UNKNOWN",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of machine clears due to memory ordering in which an internal load passes an older store within the same CPU.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xc3",
-        "EventName": "MACHINE_CLEARS.DISAMBIGUATION",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "20003",
-        "Speculative": "1",
-        "UMask": "0x8",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of machines clears due to memory renaming.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xc3",
-        "EventName": "MACHINE_CLEARS.MRN_NUKE",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x80",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of machine clears due to a page fault.  Counts both I-Side and D-Side (Loads/Stores) page faults.  A page fault occurs when either the page is not present, or an access violation occurs.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xc3",
-        "EventName": "MACHINE_CLEARS.PAGE_FAULT",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "20003",
-        "Speculative": "1",
-        "UMask": "0x20",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of machine clears that flush the pipeline and restart the machine with the use of microcode due to SMC, MEMORY_ORDERING, FP_ASSISTS, PAGE_FAULT, DISAMBIGUATION, and FPC_VIRTUAL_TRAP.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xc3",
-        "EventName": "MACHINE_CLEARS.SLOW",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "20003",
-        "Speculative": "1",
-        "UMask": "0x6f",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of machine clears due to program modifying data (self modifying code) within 1K of a recently fetched code page.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xc3",
-        "EventName": "MACHINE_CLEARS.SMC",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "20003",
-        "Speculative": "1",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots not consumed by the backend due to a micro-sequencer (MS) scoreboard, which stalls the front-end from issuing from the UROM until a specified older uop retires.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x75",
-        "EventName": "SERIALIZATION.NON_C01_MS_SCB",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "Speculative": "1",
-        "UMask": "0x2",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x73",
-        "EventName": "TOPDOWN_BAD_SPECULATION.ALL",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to fast nukes such as memory ordering and memory disambiguation machine clears.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x73",
-        "EventName": "TOPDOWN_BAD_SPECULATION.FASTNUKE",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x2",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a machine clear (nuke) of any kind including memory ordering and memory disambiguation.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x73",
-        "EventName": "TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x3",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to branch mispredicts.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x73",
-        "EventName": "TOPDOWN_BAD_SPECULATION.MISPREDICT",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x4",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to a machine clear (nuke).",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x73",
-        "EventName": "TOPDOWN_BAD_SPECULATION.NUKE",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the total number of issue slots every cycle that were not consumed by the backend due to backend stalls.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x74",
-        "EventName": "TOPDOWN_BE_BOUND.ALL",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to certain allocation restrictions.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x74",
-        "EventName": "TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to memory reservation stalls in which a scheduler is not able to accept uops.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x74",
-        "EventName": "TOPDOWN_BE_BOUND.MEM_SCHEDULER",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x2",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to IEC or FPC RAT stalls, which can be due to FIQ or IEC reservation stalls in which the integer, floating point or SIMD scheduler is not able to accept uops.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x74",
-        "EventName": "TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x8",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to the physical register file unable to accept an entry (marble stalls).",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x74",
-        "EventName": "TOPDOWN_BE_BOUND.REGISTER",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x20",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to the reorder buffer being full (ROB stalls).",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x74",
-        "EventName": "TOPDOWN_BE_BOUND.REORDER_BUFFER",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x40",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to scoreboards from the instruction queue (IQ), jump execution unit (JEU), or microcode sequencer (MS).",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x74",
-        "EventName": "TOPDOWN_BE_BOUND.SERIALIZATION",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x10",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the total number of issue slots every cycle that were not consumed by the backend due to frontend stalls.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x71",
-        "EventName": "TOPDOWN_FE_BOUND.ALL",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BACLEARS.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x71",
-        "EventName": "TOPDOWN_FE_BOUND.BRANCH_DETECT",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x2",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BTCLEARS.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x71",
-        "EventName": "TOPDOWN_FE_BOUND.BRANCH_RESTEER",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x40",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to the microcode sequencer (MS).",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x71",
-        "EventName": "TOPDOWN_FE_BOUND.CISC",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to decode stalls.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x71",
-        "EventName": "TOPDOWN_FE_BOUND.DECODE",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x8",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x71",
-        "EventName": "TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x8d",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to a latency related stalls including BACLEARs, BTCLEARs, ITLB misses, and ICache misses.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x71",
-        "EventName": "TOPDOWN_FE_BOUND.FRONTEND_LATENCY",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x72",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to ITLB misses.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x71",
-        "EventName": "TOPDOWN_FE_BOUND.ITLB",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x10",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to other common frontend stalls not categorized.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x71",
-        "EventName": "TOPDOWN_FE_BOUND.OTHER",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x80",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to wrong predecodes.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x71",
-        "EventName": "TOPDOWN_FE_BOUND.PREDECODE",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x4",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the total number of consumed retirement slots.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xc2",
-        "EventName": "TOPDOWN_RETIRING.ALL",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the total number of uops retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xc2",
-        "EventName": "UOPS_RETIRED.ALL",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "2000003",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of integer divide uops retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xc2",
-        "EventName": "UOPS_RETIRED.IDIV",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "2000003",
-        "UMask": "0x10",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of uops that are from complex flows issued by the micro-sequencer (MS).",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xc2",
-        "EventName": "UOPS_RETIRED.MS",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "2000003",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of x87 uops retired, includes those in MS flows.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0xc2",
-        "EventName": "UOPS_RETIRED.X87",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "2000003",
-        "UMask": "0x2",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "This event is deprecated. Refer to new event ARITH.DIV_ACTIVE",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "CounterMask": "1",
-        "EventCode": "0xb0",
-        "EventName": "ARITH.DIVIDER_ACTIVE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x9",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "Cycles when divide unit is busy executing divide or square root operations.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "CounterMask": "1",
-        "EventCode": "0xb0",
-        "EventName": "ARITH.DIV_ACTIVE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x9",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "This event is deprecated. Refer to new event ARITH.FPDIV_ACTIVE",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "CounterMask": "1",
-        "EventCode": "0xb0",
-        "EventName": "ARITH.FP_DIVIDER_ACTIVE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x1",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "This event counts the cycles the integer divider is busy.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "EventCode": "0xb0",
-        "EventName": "ARITH.IDIV_ACTIVE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x8",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "This event is deprecated. Refer to new event ARITH.IDIV_ACTIVE",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "CounterMask": "1",
-        "EventCode": "0xb0",
-        "EventName": "ARITH.INT_DIVIDER_ACTIVE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x8",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "Number of occurrences where a microcode assist is invoked by hardware.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "EventCode": "0xc1",
-        "EventName": "ASSISTS.ANY",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "100003",
-        "Speculative": "1",
-        "UMask": "0x1b",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "All branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "EventCode": "0xc4",
-        "EventName": "BR_INST_RETIRED.ALL_BRANCHES",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "400009",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "Conditional branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "EventCode": "0xc4",
-        "EventName": "BR_INST_RETIRED.COND",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "400009",
-        "UMask": "0x11",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "Not taken branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "EventCode": "0xc4",
-        "EventName": "BR_INST_RETIRED.COND_NTAKEN",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "400009",
-        "UMask": "0x10",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "Taken conditional branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "EventCode": "0xc4",
-        "EventName": "BR_INST_RETIRED.COND_TAKEN",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "400009",
-        "UMask": "0x1",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "Far branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "EventCode": "0xc4",
-        "EventName": "BR_INST_RETIRED.FAR_BRANCH",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "100007",
-        "UMask": "0x40",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "Indirect near branch instructions retired (excluding returns)",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "EventCode": "0xc4",
-        "EventName": "BR_INST_RETIRED.INDIRECT",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "100003",
-        "UMask": "0x80",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "Direct and indirect near call instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "EventCode": "0xc4",
-        "EventName": "BR_INST_RETIRED.NEAR_CALL",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "100007",
-        "UMask": "0x2",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "Return instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "EventCode": "0xc4",
-        "EventName": "BR_INST_RETIRED.NEAR_RETURN",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "100007",
-        "UMask": "0x8",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "Taken branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "EventCode": "0xc4",
-        "EventName": "BR_INST_RETIRED.NEAR_TAKEN",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "400009",
-        "UMask": "0x20",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "All mispredicted branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "EventCode": "0xc5",
-        "EventName": "BR_MISP_RETIRED.ALL_BRANCHES",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "400009",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "Mispredicted conditional branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "EventCode": "0xc5",
-        "EventName": "BR_MISP_RETIRED.COND",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "400009",
-        "UMask": "0x11",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "Mispredicted non-taken conditional branch instructions retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "EventCode": "0xc5",
-        "EventName": "BR_MISP_RETIRED.COND_NTAKEN",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "400009",
-        "UMask": "0x10",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "number of branch instructions retired that were mispredicted and taken.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "EventCode": "0xc5",
-        "EventName": "BR_MISP_RETIRED.COND_TAKEN",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "400009",
-        "UMask": "0x1",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "Mispredicted indirect CALL retired.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "EventCode": "0xc5",
-        "EventName": "BR_MISP_RETIRED.INDIRECT_CALL",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "400009",
-        "UMask": "0x2",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "Number of near branch instructions retired that were mispredicted and taken.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "EventCode": "0xc5",
-        "EventName": "BR_MISP_RETIRED.NEAR_TAKEN",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "400009",
-        "UMask": "0x20",
-        "Unit": "cpu_core"
-    },
-    {
-        "BriefDescription": "This event counts the number of mispredicted ret instructions retired. Non PEBS",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
-        "EventCode": "0xc5",
-        "EventName": "BR_MISP_RETIRED.RET",
-        "PEBS": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
-        "SampleAfterValue": "100007",
-        "UMask": "0x8",
-        "Unit": "cpu_core"
-    },
-    {
         "BriefDescription": "Core clocks when the thread is in the C0.1 light-weight slower wakeup time but more power saving optimized state.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xec",
         "EventName": "CPU_CLK_UNHALTED.C01",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts core clocks when the thread is in the C0.1 light-weight slower wakeup time but more power saving optimized state.  This state can be entered via the TPAUSE or UMWAIT instructions.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x10",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Core clocks when the thread is in the C0.2 light-weight faster wakeup time but less power saving optimized state.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xec",
         "EventName": "CPU_CLK_UNHALTED.C02",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts core clocks when the thread is in the C0.2 light-weight faster wakeup time but less power saving optimized state.  This state can be entered via the TPAUSE or UMWAIT instructions.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x20",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Core clocks when the thread is in the C0.1 or C0.2 or running a PAUSE in C0 ACPI state.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xec",
         "EventName": "CPU_CLK_UNHALTED.C0_WAIT",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts core clocks when the thread is in the C0.1 or C0.2 power saving optimized states (TPAUSE or UMWAIT instructions) or running the PAUSE instruction.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x70",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of unhalted core clock cycles. (Fixed event)",
+        "EventName": "CPU_CLK_UNHALTED.CORE",
+        "PublicDescription": "Counts the number of core cycles while the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. The core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time. This event uses fixed counter 1.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted core clock cycles.",
+        "EventCode": "0x3c",
+        "EventName": "CPU_CLK_UNHALTED.CORE_P",
+        "PublicDescription": "Counts the number of core cycles while the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. The core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time. This event uses a programmable general purpose performance counter.",
+        "SampleAfterValue": "2000003",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Cycle counts are evenly distributed between active threads in the Core.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xec",
         "EventName": "CPU_CLK_UNHALTED.DISTRIBUTED",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "This event distributes cycle counts between active hyperthreads, i.e., those in C0.  A hyperthread becomes inactive when it executes the HLT or MWAIT instructions.  If all other hyperthreads are inactive (or disabled or do not exist), all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Core crystal clock cycles when this thread is unhalted and the other thread is halted.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0x3c",
         "EventName": "CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts Core crystal clock cycles when current thread is unhalted and the other thread is halted.",
         "SampleAfterValue": "25003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "CPU_CLK_UNHALTED.PAUSE",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xec",
         "EventName": "CPU_CLK_UNHALTED.PAUSE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x40",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "CPU_CLK_UNHALTED.PAUSE_INST",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "1",
         "EdgeDetect": "1",
         "EventCode": "0xec",
         "EventName": "CPU_CLK_UNHALTED.PAUSE_INST",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x40",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Core crystal clock cycles. Cycle counts are evenly distributed between active threads in the Core.",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0x3c",
         "EventName": "CPU_CLK_UNHALTED.REF_DISTRIBUTED",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "This event distributes Core crystal clock cycle counts between active hyperthreads, i.e., those in C0 sleep-state. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If one thread is active in a core, all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of unhalted reference clock cycles at TSC frequency. (Fixed event)",
+        "EventName": "CPU_CLK_UNHALTED.REF_TSC",
+        "PublicDescription": "Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. This event is not affected by core frequency changes and increments at a fixed frequency that is also used for the Time Stamp Counter (TSC). This event uses fixed counter 2.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x3",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Reference cycles when the core is not in halt state.",
-        "CollectPEBSRecord": "2",
-        "Counter": "Fixed counter 2",
         "EventName": "CPU_CLK_UNHALTED.REF_TSC",
-        "PEBScounters": "34",
+        "PublicDescription": "Counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. It is counted on a dedicated fixed counter, leaving the eight programmable counters available for other events. Note: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'.  The counter update is done at a lower clock rate then the core clock the overflow status bit for this counter may appear 'sticky'.  After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x3",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of unhalted reference clock cycles at TSC frequency.",
+        "EventCode": "0x3c",
+        "EventName": "CPU_CLK_UNHALTED.REF_TSC_P",
+        "PublicDescription": "Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. This event is not affected by core frequency changes and increments at a fixed frequency that is also used for the Time Stamp Counter (TSC). This event uses a programmable general purpose performance counter.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Reference cycles when the core is not in halt state.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0x3c",
         "EventName": "CPU_CLK_UNHALTED.REF_TSC_P",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events. Note: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'.  The counter update is done at a lower clock rate then the core clock the overflow status bit for this counter may appear 'sticky'.  After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of unhalted core clock cycles. (Fixed event)",
+        "EventName": "CPU_CLK_UNHALTED.THREAD",
+        "PublicDescription": "Counts the number of core cycles while the core is not in a halt state.  The core enters the halt state when it is running the HLT instruction. The core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time.  This event uses fixed counter 1.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Core cycles when the thread is not in halt state",
-        "CollectPEBSRecord": "2",
-        "Counter": "Fixed counter 1",
         "EventName": "CPU_CLK_UNHALTED.THREAD",
-        "PEBScounters": "33",
+        "PublicDescription": "Counts the number of core cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. This event is a component in many key event ratios. The core frequency may change from time to time due to transitions associated with Enhanced Intel SpeedStep Technology or TM2. For this reason this event may have a changing ratio with regards to time. When the core frequency is constant, this event can approximate elapsed time while the core was not in the halt state. It is counted on a dedicated fixed counter, leaving the eight programmable counters available for other events.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of unhalted core clock cycles.",
+        "EventCode": "0x3c",
+        "EventName": "CPU_CLK_UNHALTED.THREAD_P",
+        "PublicDescription": "Counts the number of core cycles while the core is not in a halt state.  The core enters the halt state when it is running the HLT instruction. The core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time. This event uses a programmable general purpose performance counter.",
+        "SampleAfterValue": "2000003",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Thread cycles when thread is not in halt state",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0x3c",
         "EventName": "CPU_CLK_UNHALTED.THREAD_P",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "This is an architectural event that counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling. For this reason, this event may have a changing ratio with regards to wall clock time.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles while L1 cache miss demand load is outstanding.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "8",
         "EventCode": "0xa3",
         "EventName": "CYCLE_ACTIVITY.CYCLES_L1D_MISS",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles while L2 cache miss demand load is outstanding.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "1",
         "EventCode": "0xa3",
         "EventName": "CYCLE_ACTIVITY.CYCLES_L2_MISS",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles while memory subsystem has an outstanding load.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "16",
         "EventCode": "0xa3",
         "EventName": "CYCLE_ACTIVITY.CYCLES_MEM_ANY",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x10",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Execution stalls while L1 cache miss demand load is outstanding.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "12",
         "EventCode": "0xa3",
         "EventName": "CYCLE_ACTIVITY.STALLS_L1D_MISS",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0xc",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Execution stalls while L2 cache miss demand load is outstanding.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "5",
         "EventCode": "0xa3",
         "EventName": "CYCLE_ACTIVITY.STALLS_L2_MISS",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x5",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Total execution stalls.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "4",
         "EventCode": "0xa3",
         "EventName": "CYCLE_ACTIVITY.STALLS_TOTAL",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles total of 1 uop is executed on all ports and Reservation Station was not empty.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xa6",
         "EventName": "EXE_ACTIVITY.1_PORTS_UTIL",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts cycles during which a total of 1 uop was executed on all ports and Reservation Station (RS) was not empty.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles total of 2 uops are executed on all ports and Reservation Station was not empty.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xa6",
         "EventName": "EXE_ACTIVITY.2_PORTS_UTIL",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts cycles during which a total of 2 uops were executed on all ports and Reservation Station (RS) was not empty.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles total of 3 uops are executed on all ports and Reservation Station was not empty.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xa6",
         "EventName": "EXE_ACTIVITY.3_PORTS_UTIL",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Cycles total of 3 uops are executed on all ports and Reservation Station (RS) was not empty.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles total of 4 uops are executed on all ports and Reservation Station was not empty.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xa6",
         "EventName": "EXE_ACTIVITY.4_PORTS_UTIL",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Cycles total of 4 uops are executed on all ports and Reservation Station (RS) was not empty.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x10",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Execution stalls while memory subsystem has an outstanding load.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "5",
         "EventCode": "0xa6",
         "EventName": "EXE_ACTIVITY.BOUND_ON_LOADS",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x21",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles where the Store Buffer was full and no loads caused an execution stall.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "2",
         "EventCode": "0xa6",
         "EventName": "EXE_ACTIVITY.BOUND_ON_STORES",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts cycles where the Store Buffer was full and no loads caused an execution stall.",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x40",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles no uop executed while RS was not empty, the SB was not full and there was no outstanding load.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xa6",
         "EventName": "EXE_ACTIVITY.EXE_BOUND_0_PORTS",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Number of cycles total of 0 uops executed on all ports, Reservation Station (RS) was not empty, the Store Buffer (SB) was not full and there was no outstanding load.",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x80",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Instruction decoders utilized in a cycle",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x75",
         "EventName": "INST_DECODED.DECODERS",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Number of decoders utilized in a cycle when the MITE (legacy decode pipeline) fetches instructions.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the total number of instructions retired. (Fixed event)",
+        "EventName": "INST_RETIRED.ANY",
+        "PEBS": "1",
+        "PublicDescription": "Counts the total number of instructions that retired. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. This event continues counting during hardware interrupts, traps, and inside interrupt handlers. This event uses fixed counter 0.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Number of instructions retired. Fixed Counter - architectural event",
-        "CollectPEBSRecord": "2",
-        "Counter": "Fixed counter 0",
         "EventName": "INST_RETIRED.ANY",
         "PEBS": "1",
-        "PEBScounters": "32",
+        "PublicDescription": "Counts the number of X86 instructions retired - an Architectural PerfMon event. Counting continues during hardware interrupts, traps, and inside interrupt handlers. Notes: INST_RETIRED.ANY is counted by a designated fixed counter freeing up programmable counters to count other events. INST_RETIRED.ANY_P is counted by a programmable counter.",
         "SampleAfterValue": "2000003",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the total number of instructions retired.",
+        "EventCode": "0xc0",
+        "EventName": "INST_RETIRED.ANY_P",
+        "PEBS": "1",
+        "PublicDescription": "Counts the total number of instructions that retired. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. This event continues counting during hardware interrupts, traps, and inside interrupt handlers. This event uses a programmable general purpose performance counter.",
+        "SampleAfterValue": "2000003",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Number of instructions retired. General Counter - architectural event",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc0",
         "EventName": "INST_RETIRED.ANY_P",
         "PEBS": "1",
-        "PEBScounters": "1,2,3,4,5,6,7",
+        "PublicDescription": "Counts the number of X86 instructions retired - an Architectural PerfMon event. Counting continues during hardware interrupts, traps, and inside interrupt handlers. Notes: INST_RETIRED.ANY is counted by a designated fixed counter freeing up programmable counters to count other events. INST_RETIRED.ANY_P is counted by a programmable counter.",
         "SampleAfterValue": "2000003",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "INST_RETIRED.MACRO_FUSED",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc0",
         "EventName": "INST_RETIRED.MACRO_FUSED",
-        "PEBScounters": "1,2,3,4,5,6,7",
         "SampleAfterValue": "2000003",
         "UMask": "0x10",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired NOP instructions.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc0",
         "EventName": "INST_RETIRED.NOP",
-        "PEBScounters": "1,2,3,4,5,6,7",
+        "PublicDescription": "Counts all retired NOP or ENDBR32/64 instructions",
         "SampleAfterValue": "2000003",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Precise instruction retired with PEBS precise-distribution",
-        "CollectPEBSRecord": "2",
-        "Counter": "Fixed counter 0",
         "EventName": "INST_RETIRED.PREC_DIST",
         "PEBS": "1",
-        "PEBScounters": "32",
+        "PublicDescription": "A version of INST_RETIRED that allows for a precise distribution of samples across instructions retired. It utilizes the Precise Distribution of Instructions Retired (PDIR++) feature to fix bias in how retired instructions get sampled. Use on Fixed Counter 0.",
         "SampleAfterValue": "2000003",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "INST_RETIRED.REP_ITERATION",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc0",
         "EventName": "INST_RETIRED.REP_ITERATION",
-        "PEBScounters": "1,2,3,4,5,6,7",
         "SampleAfterValue": "2000003",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts cycles after recovery from a branch misprediction or machine clear till the first uop is issued from the resteered path.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xad",
         "EventName": "INT_MISC.CLEAR_RESTEER_CYCLES",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Cycles after recovery from a branch misprediction or machine clear till the first uop is issued from the resteered path.",
         "SampleAfterValue": "500009",
-        "Speculative": "1",
         "UMask": "0x80",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Core cycles the allocator was stalled due to recovery from earlier clear event for this thread",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xad",
         "EventName": "INT_MISC.RECOVERY_CYCLES",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts core cycles when the Resource allocator was stalled due to recovery from an earlier branch misprediction or machine clear event.",
         "SampleAfterValue": "500009",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "INT_MISC.UNKNOWN_BRANCH_CYCLES",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xad",
         "EventName": "INT_MISC.UNKNOWN_BRANCH_CYCLES",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x7",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "TakenAlone": "1",
         "UMask": "0x40",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "TMA slots where uops got dropped",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xad",
         "EventName": "INT_MISC.UOP_DROPPING",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Estimated number of Top-down Microarchitecture Analysis slots that got dropped due to non front-end reasons",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x10",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "INT_VEC_RETIRED.128BIT",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xe7",
         "EventName": "INT_VEC_RETIRED.128BIT",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "1000003",
         "UMask": "0x13",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "INT_VEC_RETIRED.256BIT",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xe7",
         "EventName": "INT_VEC_RETIRED.256BIT",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "1000003",
         "UMask": "0xac",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "integer ADD, SUB, SAD 128-bit vector instructions.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xe7",
         "EventName": "INT_VEC_RETIRED.ADD_128",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 128-bit vector instructions.",
         "SampleAfterValue": "1000003",
         "UMask": "0x3",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "integer ADD, SUB, SAD 256-bit vector instructions.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xe7",
         "EventName": "INT_VEC_RETIRED.ADD_256",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 256-bit vector instructions.",
         "SampleAfterValue": "1000003",
         "UMask": "0xc",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "INT_VEC_RETIRED.MUL_256",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xe7",
         "EventName": "INT_VEC_RETIRED.MUL_256",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "1000003",
         "UMask": "0x80",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "INT_VEC_RETIRED.SHUFFLES",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xe7",
         "EventName": "INT_VEC_RETIRED.SHUFFLES",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "1000003",
         "UMask": "0x40",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "INT_VEC_RETIRED.VNNI_128",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xe7",
         "EventName": "INT_VEC_RETIRED.VNNI_128",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "1000003",
         "UMask": "0x10",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "INT_VEC_RETIRED.VNNI_256",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xe7",
         "EventName": "INT_VEC_RETIRED.VNNI_256",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "1000003",
         "UMask": "0x20",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "This event is deprecated. Refer to new event LD_BLOCKS.ADDRESS_ALIAS",
+        "Deprecated": "1",
+        "EventCode": "0x03",
+        "EventName": "LD_BLOCKS.4K_ALIAS",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of retired loads that are blocked because it initially appears to be store forward blocked, but subsequently is shown not to be blocked based on 4K alias check.",
+        "EventCode": "0x03",
+        "EventName": "LD_BLOCKS.ADDRESS_ALIAS",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "False dependencies in MOB due to partial compare on address.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x03",
         "EventName": "LD_BLOCKS.ADDRESS_ALIAS",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the number of times a load got blocked due to false dependencies in MOB due to partial compare on address.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of retired loads that are blocked because its address exactly matches an older store whose data is not ready.",
+        "EventCode": "0x03",
+        "EventName": "LD_BLOCKS.DATA_UNKNOWN",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "The number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x03",
         "EventName": "LD_BLOCKS.NO_SR",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x88",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Loads blocked due to overlapping with a preceding store that cannot be forwarded.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x03",
         "EventName": "LD_BLOCKS.STORE_FORWARD",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the number of times where store forwarding was prevented for a load operation. The most common case is a load blocked due to the address of memory access (partially) overlapping with a preceding uncompleted store. Note: See the table of not supported store forwards in the Optimization Guide.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x82",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts the number of demand load dispatches that hit L1D fill buffer (FB) allocated for software prefetch.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x4c",
         "EventName": "LOAD_HIT_PREFETCH.SWPF",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts all not software-prefetch load dispatches that hit the fill buffer (FB) allocated for the software prefetch. It can also be incremented by some lock instructions. So it should only be used with profiling so that the locks can be excluded by ASM (Assembly File) inspection of the nearby instructions.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles Uops delivered by the LSD, but didn't come from the decoder.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "1",
         "EventCode": "0xa8",
         "EventName": "LSD.CYCLES_ACTIVE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts the cycles when at least one uop is delivered by the LSD (Loop-stream detector).",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles optimal number of Uops delivered by the LSD, but did not come from the decoder.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "6",
         "EventCode": "0xa8",
         "EventName": "LSD.CYCLES_OK",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts the cycles when optimal number of uops is delivered by the LSD (Loop-stream detector).",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Number of Uops delivered by the LSD.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xa8",
         "EventName": "LSD.UOPS",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts the number of uops delivered to the back-end by the LSD(Loop Stream Detector).",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Number of machine clears (nukes) of any type.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "1",
         "EdgeDetect": "1",
         "EventCode": "0xc3",
         "EventName": "MACHINE_CLEARS.COUNT",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts the number of machine clears (nukes) of any type.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of machine clears due to memory ordering in which an internal load passes an older store within the same CPU.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.DISAMBIGUATION",
+        "SampleAfterValue": "20003",
+        "UMask": "0x8",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of machines clears due to memory renaming.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.MRN_NUKE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x80",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears due to a page fault.  Counts both I-Side and D-Side (Loads/Stores) page faults.  A page fault occurs when either the page is not present, or an access violation occurs.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.PAGE_FAULT",
+        "SampleAfterValue": "20003",
+        "UMask": "0x20",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears that flush the pipeline and restart the machine with the use of microcode due to SMC, MEMORY_ORDERING, FP_ASSISTS, PAGE_FAULT, DISAMBIGUATION, and FPC_VIRTUAL_TRAP.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.SLOW",
+        "SampleAfterValue": "20003",
+        "UMask": "0x6f",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears due to program modifying data (self modifying code) within 1K of a recently fetched code page.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.SMC",
+        "SampleAfterValue": "20003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Self-modifying code (SMC) detected.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc3",
         "EventName": "MACHINE_CLEARS.SMC",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts self-modifying code (SMC) detected, which causes a machine clear.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "MISC2_RETIRED.LFENCE",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xe0",
         "EventName": "MISC2_RETIRED.LFENCE",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "400009",
-        "Speculative": "1",
         "UMask": "0x20",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Increments whenever there is an update to the LBR array.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xcc",
         "EventName": "MISC_RETIRED.LBR_INSERTS",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Increments when an entry is added to the Last Branch Record (LBR) array (or removed from the array in case of RETURNs in call stack mode). The event requires LBR enable via IA32_DEBUGCTL MSR and branch type selection via MSR_LBR_SELECT.",
         "SampleAfterValue": "100003",
         "UMask": "0x20",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles stalled due to no store buffers available. (not including draining form sync).",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xa2",
         "EventName": "RESOURCE_STALLS.SB",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts allocation stall cycles caused by the store buffer (SB) being full. This counts cycles that the pipeline back-end blocked uop delivery from the front-end.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts cycles where the pipeline is stalled due to serializing operations.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xa2",
         "EventName": "RESOURCE_STALLS.SCOREBOARD",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of issue slots not consumed by the backend due to a micro-sequencer (MS) scoreboard, which stalls the front-end from issuing from the UROM until a specified older uop retires.",
+        "EventCode": "0x75",
+        "EventName": "SERIALIZATION.NON_C01_MS_SCB",
+        "PublicDescription": "Counts the number of issue slots not consumed by the backend due to a micro-sequencer (MS) scoreboard, which stalls the front-end from issuing from the UROM until a specified older uop retires. The most commonly executed instruction with an MS scoreboard is PAUSE.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "TMA slots where no uops were being issued due to lack of back-end resources.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xa4",
         "EventName": "TOPDOWN.BACKEND_BOUND_SLOTS",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Number of slots in TMA method where no micro-operations were being issued from front-end to back-end of the machine due to lack of back-end resources.",
         "SampleAfterValue": "10000003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "TMA slots wasted due to incorrect speculations.",
-        "CollectPEBSRecord": "2",
         "EventCode": "0xa4",
         "EventName": "TOPDOWN.BAD_SPEC_SLOTS",
+        "PublicDescription": "Number of slots of TMA method that were wasted due to incorrect speculation. It covers all types of control-flow or data-related mis-speculations.",
         "SampleAfterValue": "10000003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "TMA slots wasted due to incorrect speculation by branch mispredictions",
-        "CollectPEBSRecord": "2",
         "EventCode": "0xa4",
         "EventName": "TOPDOWN.BR_MISPREDICT_SLOTS",
+        "PublicDescription": "Number of TMA slots that were wasted due to incorrect speculation by (any type of) branch mispredictions. This event estimates number of specualtive operations that were issued but not retired as well as the out-of-order engine recovery past a branch misprediction.",
         "SampleAfterValue": "10000003",
-        "Speculative": "1",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "TOPDOWN.MEMORY_BOUND_SLOTS",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xa4",
         "EventName": "TOPDOWN.MEMORY_BOUND_SLOTS",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "10000003",
-        "Speculative": "1",
         "UMask": "0x10",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "TMA slots available for an unhalted logical processor. Fixed counter - architectural event",
-        "CollectPEBSRecord": "2",
-        "Counter": "Fixed counter 3",
         "EventName": "TOPDOWN.SLOTS",
-        "PEBScounters": "35",
+        "PublicDescription": "Number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method (TMA). The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core. Software can use this event as the denominator for the top-level metrics of the TMA method. This architectural event is counted on a designated fixed counter (Fixed Counter 3).",
         "SampleAfterValue": "10000003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "TMA slots available for an unhalted logical processor. General counter - architectural event",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xa4",
         "EventName": "TOPDOWN.SLOTS_P",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts the number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method. The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core.",
         "SampleAfterValue": "10000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.",
+        "EventCode": "0x73",
+        "EventName": "TOPDOWN_BAD_SPECULATION.ALL",
+        "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window including relevant microcode flows and while uops are not yet available in the instruction queue (IQ) even if an FE_bound event occurs during this period. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.",
+        "SampleAfterValue": "1000003",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to fast nukes such as memory ordering and memory disambiguation machine clears.",
+        "EventCode": "0x73",
+        "EventName": "TOPDOWN_BAD_SPECULATION.FASTNUKE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a machine clear (nuke) of any kind including memory ordering and memory disambiguation.",
+        "EventCode": "0x73",
+        "EventName": "TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x3",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to branch mispredicts.",
+        "EventCode": "0x73",
+        "EventName": "TOPDOWN_BAD_SPECULATION.MISPREDICT",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to a machine clear (nuke).",
+        "EventCode": "0x73",
+        "EventName": "TOPDOWN_BAD_SPECULATION.NUKE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the total number of issue slots every cycle that were not consumed by the backend due to backend stalls.",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.ALL",
+        "SampleAfterValue": "1000003",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to certain allocation restrictions.",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to memory reservation stalls in which a scheduler is not able to accept uops.",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.MEM_SCHEDULER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to IEC or FPC RAT stalls, which can be due to FIQ or IEC reservation stalls in which the integer, floating point or SIMD scheduler is not able to accept uops.",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x8",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to the physical register file unable to accept an entry (marble stalls).",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.REGISTER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x20",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to the reorder buffer being full (ROB stalls).",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.REORDER_BUFFER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x40",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to scoreboards from the instruction queue (IQ), jump execution unit (JEU), or microcode sequencer (MS).",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.SERIALIZATION",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x10",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the total number of issue slots every cycle that were not consumed by the backend due to frontend stalls.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.ALL",
+        "SampleAfterValue": "1000003",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BACLEARS.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.BRANCH_DETECT",
+        "PublicDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BACLEARS, which occurs when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend. Includes BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BTCLEARS.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.BRANCH_RESTEER",
+        "PublicDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BTCLEARS, which occurs when the Branch Target Buffer (BTB) predicts a taken branch.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x40",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to the microcode sequencer (MS).",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.CISC",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to decode stalls.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.DECODE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x8",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x8d",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to a latency related stalls including BACLEARs, BTCLEARs, ITLB misses, and ICache misses.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.FRONTEND_LATENCY",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x72",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to ITLB misses.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.ITLB",
+        "PublicDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to Instruction Table Lookaside Buffer (ITLB) misses.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x10",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to other common frontend stalls not categorized.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.OTHER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x80",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to wrong predecodes.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.PREDECODE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the total number of consumed retirement slots.",
+        "EventCode": "0xc2",
+        "EventName": "TOPDOWN_RETIRING.ALL",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "UOPS_DECODED.DEC0_UOPS",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x76",
         "EventName": "UOPS_DECODED.DEC0_UOPS",
-        "PEBScounters": "0,1,2,3",
         "SampleAfterValue": "1000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Uops executed on port 0",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xb2",
         "EventName": "UOPS_DISPATCHED.PORT_0",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Number of uops dispatch to execution  port 0.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Uops executed on port 1",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xb2",
         "EventName": "UOPS_DISPATCHED.PORT_1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Number of uops dispatch to execution  port 1.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Uops executed on ports 2, 3 and 10",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xb2",
         "EventName": "UOPS_DISPATCHED.PORT_2_3_10",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Number of uops dispatch to execution ports 2, 3 and 10",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Uops executed on ports 4 and 9",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xb2",
         "EventName": "UOPS_DISPATCHED.PORT_4_9",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Number of uops dispatch to execution ports 4 and 9",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x10",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Uops executed on ports 5 and 11",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xb2",
         "EventName": "UOPS_DISPATCHED.PORT_5_11",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Number of uops dispatch to execution ports 5 and 11",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x20",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Uops executed on port 6",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xb2",
         "EventName": "UOPS_DISPATCHED.PORT_6",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Number of uops dispatch to execution  port 6.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x40",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Uops executed on ports 7 and 8",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xb2",
         "EventName": "UOPS_DISPATCHED.PORT_7_8",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Number of uops dispatch to execution  ports 7 and 8.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x80",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles at least 1 micro-op is executed from any thread on physical core.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "1",
         "EventCode": "0xb1",
         "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts cycles when at least 1 micro-op is executed from any thread on physical core.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles at least 2 micro-op is executed from any thread on physical core.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "2",
         "EventCode": "0xb1",
         "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_2",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts cycles when at least 2 micro-ops are executed from any thread on physical core.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles at least 3 micro-op is executed from any thread on physical core.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "3",
         "EventCode": "0xb1",
         "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_3",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts cycles when at least 3 micro-ops are executed from any thread on physical core.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles at least 4 micro-op is executed from any thread on physical core.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "4",
         "EventCode": "0xb1",
         "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_4",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts cycles when at least 4 micro-ops are executed from any thread on physical core.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles where at least 1 uop was executed per-thread",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "1",
         "EventCode": "0xb1",
         "EventName": "UOPS_EXECUTED.CYCLES_GE_1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Cycles where at least 1 uop was executed per-thread.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles where at least 2 uops were executed per-thread",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "2",
         "EventCode": "0xb1",
         "EventName": "UOPS_EXECUTED.CYCLES_GE_2",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Cycles where at least 2 uops were executed per-thread.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles where at least 3 uops were executed per-thread",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "3",
         "EventCode": "0xb1",
         "EventName": "UOPS_EXECUTED.CYCLES_GE_3",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Cycles where at least 3 uops were executed per-thread.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles where at least 4 uops were executed per-thread",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "4",
         "EventCode": "0xb1",
         "EventName": "UOPS_EXECUTED.CYCLES_GE_4",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Cycles where at least 4 uops were executed per-thread.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts number of cycles no uops were dispatched to be executed on this thread.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "1",
         "EventCode": "0xb1",
         "EventName": "UOPS_EXECUTED.STALLS",
         "Invert": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "This event is deprecated. Refer to new event UOPS_EXECUTED.STALLS",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "1",
+        "Deprecated": "1",
         "EventCode": "0xb1",
         "EventName": "UOPS_EXECUTED.STALL_CYCLES",
         "Invert": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts the number of uops to be executed per-thread each cycle.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xb1",
         "EventName": "UOPS_EXECUTED.THREAD",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Counts the number of x87 uops dispatched.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xb1",
         "EventName": "UOPS_EXECUTED.X87",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts the number of x87 uops executed.",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x10",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Uops that RAT issues to RS",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xae",
         "EventName": "UOPS_ISSUED.ANY",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts the number of uops that the Resource Allocation Table (RAT) issues to the Reservation Station (RS).",
         "SampleAfterValue": "2000003",
-        "Speculative": "1",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the total number of uops retired.",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.ALL",
+        "PEBS": "1",
+        "SampleAfterValue": "2000003",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Cycles with retired uop(s).",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "1",
         "EventCode": "0xc2",
         "EventName": "UOPS_RETIRED.CYCLES",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts cycles where at least one uop has retired.",
         "SampleAfterValue": "1000003",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retired uops except the last uop of each instruction.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc2",
         "EventName": "UOPS_RETIRED.HEAVY",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts the number of retired micro-operations (uops) except the last uop of each instruction. An instruction that is decoded into less than two uops does not contribute to the count.",
         "SampleAfterValue": "2000003",
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of integer divide uops retired.",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.IDIV",
+        "PEBS": "1",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x10",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of uops that are from complex flows issued by the micro-sequencer (MS).",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.MS",
+        "PEBS": "1",
+        "PublicDescription": "Counts the number of uops that are from complex flows issued by the Microcode Sequencer (MS). This includes uops from flows due to complex instructions, faults, assists, and inserted flows.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "UOPS_RETIRED.MS",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc2",
         "EventName": "UOPS_RETIRED.MS",
         "MSRIndex": "0x3F7",
         "MSRValue": "0x8",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "2000003",
-        "TakenAlone": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Retirement slots used.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "EventCode": "0xc2",
         "EventName": "UOPS_RETIRED.SLOTS",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "Counts the retirement slots used each cycle.",
         "SampleAfterValue": "2000003",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles without actually retired uops.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "1",
         "EventCode": "0xc2",
         "EventName": "UOPS_RETIRED.STALLS",
         "Invert": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
+        "PublicDescription": "This event counts cycles without actually retired uops.",
         "SampleAfterValue": "1000003",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "This event is deprecated. Refer to new event UOPS_RETIRED.STALLS",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5,6,7",
         "CounterMask": "1",
+        "Deprecated": "1",
         "EventCode": "0xc2",
         "EventName": "UOPS_RETIRED.STALL_CYCLES",
         "Invert": "1",
-        "PEBScounters": "0,1,2,3,4,5,6,7",
         "SampleAfterValue": "1000003",
         "UMask": "0x2",
         "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "Counts the number of x87 uops retired, includes those in MS flows.",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.X87",
+        "PEBS": "1",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
     }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/uncore-memory.json b/tools/perf/pmu-events/arch/x86/alderlake/uncore-memory.json
index d82d6f62a6fb..2ccd9cf96957 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/uncore-memory.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/uncore-memory.json
@@ -1,221 +1,174 @@
 [
     {
-        "BriefDescription": "Number of clocks",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
-        "EventCode": "0x01",
-        "EventName": "UNC_M_CLOCKTICKS",
+        "BriefDescription": "Counts every 64B read  request entering the Memory Controller 0 to DRAM (sum of all channels).",
+        "EventName": "UNC_MC0_RDCAS_COUNT_FREERUN",
         "PerPkg": "1",
+        "PublicDescription": "Counts every 64B read request entering the Memory Controller 0 to DRAM (sum of all channels).",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Incoming VC0 read request",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
-        "EventCode": "0x02",
-        "EventName": "UNC_M_VC0_REQUESTS_RD",
+        "BriefDescription": "Counts every 64B write request entering the Memory Controller 0 to DRAM (sum of all channels). Each write request counts as a new request incrementing this counter. However, same cache line write requests (both full and partial) are combined to a single 64 byte data transfer to DRAM.",
+        "EventName": "UNC_MC0_WRCAS_COUNT_FREERUN",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Incoming VC0 write request",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
-        "EventCode": "0x03",
-        "EventName": "UNC_M_VC0_REQUESTS_WR",
+        "BriefDescription": "Counts every 64B read request entering the Memory Controller 1 to DRAM (sum of all channels).",
+        "EventName": "UNC_MC1_RDCAS_COUNT_FREERUN",
         "PerPkg": "1",
+        "PublicDescription": "Counts every 64B read entering the Memory Controller 1 to DRAM (sum of all channels).",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Incoming VC1 read request",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
-        "EventCode": "0x04",
-        "EventName": "UNC_M_VC1_REQUESTS_RD",
+        "BriefDescription": "Counts every 64B write request entering the Memory Controller 1 to DRAM (sum of all channels). Each write request counts as a new request incrementing this counter. However, same cache line write requests (both full and partial) are combined to a single 64 byte data transfer to DRAM.",
+        "EventName": "UNC_MC1_WRCAS_COUNT_FREERUN",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Incoming VC1 write request",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
-        "EventCode": "0x05",
-        "EventName": "UNC_M_VC1_REQUESTS_WR",
+        "BriefDescription": "ACT command for a read request sent to DRAM",
+        "EventCode": "0x24",
+        "EventName": "UNC_M_ACT_COUNT_RD",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Incoming read prefetch request from IA",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
-        "EventCode": "0x0A",
-        "EventName": "UNC_M_PREFETCH_RD",
+        "BriefDescription": "ACT command sent to DRAM",
+        "EventCode": "0x26",
+        "EventName": "UNC_M_ACT_COUNT_TOTAL",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Any Rank at Hot state",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
-        "EventCode": "0x19",
-        "EventName": "UNC_M_DRAM_THERMAL_HOT",
+        "BriefDescription": "ACT command for a write request sent to DRAM",
+        "EventCode": "0x25",
+        "EventName": "UNC_M_ACT_COUNT_WR",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Any Rank at Warm state",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
-        "EventCode": "0x1A",
-        "EventName": "UNC_M_DRAM_THERMAL_WARM",
+        "BriefDescription": "Read CAS command sent to DRAM",
+        "EventCode": "0x22",
+        "EventName": "UNC_M_CAS_COUNT_RD",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "incoming read request page status is Page Hit",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
-        "EventCode": "0x1C",
-        "EventName": "UNC_M_DRAM_PAGE_HIT_RD",
+        "BriefDescription": "Write CAS command sent to DRAM",
+        "EventCode": "0x23",
+        "EventName": "UNC_M_CAS_COUNT_WR",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Number of clocks",
+        "EventCode": "0x01",
+        "EventName": "UNC_M_CLOCKTICKS",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
         "BriefDescription": "incoming read request page status is Page Empty",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
         "EventCode": "0x1D",
         "EventName": "UNC_M_DRAM_PAGE_EMPTY_RD",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "incoming read request page status is Page Miss",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
-        "EventCode": "0x1E",
-        "EventName": "UNC_M_DRAM_PAGE_MISS_RD",
+        "BriefDescription": "incoming write request page status is Page Empty",
+        "EventCode": "0x20",
+        "EventName": "UNC_M_DRAM_PAGE_EMPTY_WR",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "incoming read request page status is Page Hit",
+        "EventCode": "0x1C",
+        "EventName": "UNC_M_DRAM_PAGE_HIT_RD",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
         "BriefDescription": "incoming write request page status is Page Hit",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
         "EventCode": "0x1F",
         "EventName": "UNC_M_DRAM_PAGE_HIT_WR",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "incoming write request page status is Page Empty",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
-        "EventCode": "0x20",
-        "EventName": "UNC_M_DRAM_PAGE_EMPTY_WR",
+        "BriefDescription": "incoming read request page status is Page Miss",
+        "EventCode": "0x1E",
+        "EventName": "UNC_M_DRAM_PAGE_MISS_RD",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
         "BriefDescription": "incoming write request page status is Page Miss",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
         "EventCode": "0x21",
         "EventName": "UNC_M_DRAM_PAGE_MISS_WR",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Read CAS command sent to DRAM",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
-        "EventCode": "0x22",
-        "EventName": "UNC_M_CAS_COUNT_RD",
-        "PerPkg": "1",
-        "Unit": "iMC"
-    },
-    {
-        "BriefDescription": "Write CAS command sent to DRAM",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
-        "EventCode": "0x23",
-        "EventName": "UNC_M_CAS_COUNT_WR",
+        "BriefDescription": "Any Rank at Hot state",
+        "EventCode": "0x19",
+        "EventName": "UNC_M_DRAM_THERMAL_HOT",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "ACT command for a read request sent to DRAM",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
-        "EventCode": "0x24",
-        "EventName": "UNC_M_ACT_COUNT_RD",
+        "BriefDescription": "Any Rank at Warm state",
+        "EventCode": "0x1A",
+        "EventName": "UNC_M_DRAM_THERMAL_WARM",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "ACT command for a write request sent to DRAM",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
-        "EventCode": "0x25",
-        "EventName": "UNC_M_ACT_COUNT_WR",
+        "BriefDescription": "Incoming read prefetch request from IA.",
+        "EventCode": "0x0A",
+        "EventName": "UNC_M_PREFETCH_RD",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "ACT command sent to DRAM",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
-        "EventCode": "0x26",
-        "EventName": "UNC_M_ACT_COUNT_TOTAL",
+        "BriefDescription": "PRE command sent to DRAM due to page table idle timer expiration",
+        "EventCode": "0x28",
+        "EventName": "UNC_M_PRE_COUNT_IDLE",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
         "BriefDescription": "PRE command sent to DRAM for a read/write request",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
         "EventCode": "0x27",
         "EventName": "UNC_M_PRE_COUNT_PAGE_MISS",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "PRE command sent to DRAM due to page table idle timer expiration",
-        "Counter": "0,1,2,3,4",
-        "CounterType": "PGMABLE",
-        "EventCode": "0x28",
-        "EventName": "UNC_M_PRE_COUNT_IDLE",
-        "PerPkg": "1",
-        "Unit": "iMC"
-    },
-    {
-        "BriefDescription": "Counts every 64B read  request entering the Memory Controller 0 to DRAM (sum of all channels)",
-        "CounterType": "FREERUN",
-        "EventName": "UNC_MC0_RDCAS_COUNT_FREERUN",
+        "BriefDescription": "Incoming VC0 read request",
+        "EventCode": "0x02",
+        "EventName": "UNC_M_VC0_REQUESTS_RD",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Counts every 64B read request entering the Memory Controller 1 to DRAM (sum of all channels)",
-        "Counter": "3",
-        "CounterType": "FREERUN",
-        "EventName": "UNC_MC1_RDCAS_COUNT_FREERUN",
+        "BriefDescription": "Incoming VC0 write request",
+        "EventCode": "0x03",
+        "EventName": "UNC_M_VC0_REQUESTS_WR",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Counts every 64B write request entering the Memory Controller 0 to DRAM (sum of all channels). Each write request counts as a new request incrementing this counter. However, same cache line write requests (both full and partial) are combined to a single 64 byte data transfer to DRAM",
-        "Counter": "1",
-        "CounterType": "FREERUN",
-        "EventName": "UNC_MC0_WRCAS_COUNT_FREERUN",
+        "BriefDescription": "Incoming VC1 read request",
+        "EventCode": "0x04",
+        "EventName": "UNC_M_VC1_REQUESTS_RD",
         "PerPkg": "1",
         "Unit": "iMC"
     },
     {
-        "BriefDescription": "Counts every 64B write request entering the Memory Controller 1 to DRAM (sum of all channels). Each write request counts as a new request incrementing this counter. However, same cache line write requests (both full and partial) are combined to a single 64 byte data transfer to DRAM",
-        "Counter": "4",
-        "CounterType": "FREERUN",
-        "EventName": "UNC_MC1_WRCAS_COUNT_FREERUN",
+        "BriefDescription": "Incoming VC1 write request",
+        "EventCode": "0x05",
+        "EventName": "UNC_M_VC1_REQUESTS_WR",
         "PerPkg": "1",
         "Unit": "iMC"
     }
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/uncore-other.json b/tools/perf/pmu-events/arch/x86/alderlake/uncore-other.json
index b1ae349f5f21..bc5fb6b76065 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/uncore-other.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/uncore-other.json
@@ -1,40 +1,73 @@
 [
     {
-        "BriefDescription": "This 48-bit fixed counter counts the UCLK cycles",
-        "Counter": "Fixed",
-        "CounterType": "PGMABLE",
-        "EventCode": "0xff",
-        "EventName": "UNC_CLOCK.SOCKET",
+        "BriefDescription": "Number of requests allocated in Coherency Tracker.",
+        "EventCode": "0x84",
+        "EventName": "UNC_ARB_COH_TRK_REQUESTS.ALL",
         "PerPkg": "1",
-        "Unit": "CLOCK"
+        "UMask": "0x1",
+        "Unit": "ARB"
     },
     {
-        "BriefDescription": "Counts the number of coherent and in-coherent requests initiated by IA cores, processor graphic units, or LLC",
-        "Counter": "0,1",
-        "CounterType": "PGMABLE",
+        "BriefDescription": "Each cycle counts number of any coherent request at memory controller that were issued by any core.",
+        "EventCode": "0x85",
+        "EventName": "UNC_ARB_DAT_OCCUPANCY.ALL",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "ARB"
+    },
+    {
+        "BriefDescription": "Each cycle counts number of coherent reads pending on data return from memory controller that were issued by any core.",
+        "EventCode": "0x85",
+        "EventName": "UNC_ARB_DAT_OCCUPANCY.RD",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "ARB"
+    },
+    {
+        "BriefDescription": "Number of coherent read requests sent to memory controller that were issued by any core.",
         "EventCode": "0x81",
-        "EventName": "UNC_ARB_TRK_REQUESTS.ALL",
+        "EventName": "UNC_ARB_DAT_REQUESTS.RD",
         "PerPkg": "1",
-        "UMask": "0x01",
+        "UMask": "0x2",
         "Unit": "ARB"
     },
     {
-        "BriefDescription": "Number of requests allocated in Coherency Tracker",
-        "Counter": "0,1",
-        "CounterType": "PGMABLE",
-        "EventCode": "0x84",
-        "EventName": "UNC_ARB_COH_TRK_REQUESTS.ALL",
+        "BriefDescription": "This event is deprecated. Refer to new event UNC_ARB_DAT_OCCUPANCY.ALL",
+        "EventCode": "0x85",
+        "EventName": "UNC_ARB_IFA_OCCUPANCY.ALL",
         "PerPkg": "1",
-        "UMask": "0x01",
+        "UMask": "0x1",
         "Unit": "ARB"
     },
     {
-        "BriefDescription": "Each cycle counts number of all outgoing valid entries in ReqTrk. Such entry is defined as valid from its allocation in ReqTrk till deallocation. Accounts for Coherent and non-coherent traffic",
-        "CounterType": "PGMABLE",
+        "BriefDescription": "This event is deprecated. Refer to new event UNC_ARB_DAT_REQUESTS.RD",
+        "EventCode": "0x81",
+        "EventName": "UNC_ARB_REQ_TRK_REQUEST.DRD",
+        "PerPkg": "1",
+        "UMask": "0x2",
+        "Unit": "ARB"
+    },
+    {
+        "BriefDescription": "Each cycle counts number of all outgoing valid entries in ReqTrk. Such entry is defined as valid from its allocation in ReqTrk till deallocation. Accounts for Coherent and non-coherent traffic.",
         "EventCode": "0x80",
         "EventName": "UNC_ARB_TRK_OCCUPANCY.ALL",
         "PerPkg": "1",
-        "UMask": "0x01",
+        "UMask": "0x1",
         "Unit": "ARB"
+    },
+    {
+        "BriefDescription": "Counts the number of coherent and in-coherent requests initiated by IA cores, processor graphic units, or LLC.",
+        "EventCode": "0x81",
+        "EventName": "UNC_ARB_TRK_REQUESTS.ALL",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "ARB"
+    },
+    {
+        "BriefDescription": "This 48-bit fixed counter counts the UCLK cycles.",
+        "EventCode": "0xff",
+        "EventName": "UNC_CLOCK.SOCKET",
+        "PerPkg": "1",
+        "Unit": "CLOCK"
     }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/virtual-memory.json b/tools/perf/pmu-events/arch/x86/alderlake/virtual-memory.json
index 12baf768ad8d..3827d292da80 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/virtual-memory.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/virtual-memory.json
@@ -1,317 +1,236 @@
 [
     {
-        "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to any page size.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x08",
-        "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "Speculative": "1",
-        "UMask": "0xe",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to any page size.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x49",
-        "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "2000003",
-        "Speculative": "1",
-        "UMask": "0xe",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of page walks initiated by a instruction fetch that missed the first and second level TLBs.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x85",
-        "EventName": "ITLB_MISSES.MISS_CAUSED_WALK",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x1",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of page walks due to an instruction fetch that miss the PDE (Page Directory Entry) cache.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x85",
-        "EventName": "ITLB_MISSES.PDE_CACHE_MISS",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "2000003",
-        "Speculative": "1",
-        "UMask": "0x80",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to any page size.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x85",
-        "EventName": "ITLB_MISSES.WALK_COMPLETED",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "200003",
-        "Speculative": "1",
-        "UMask": "0xe",
-        "Unit": "cpu_atom"
-    },
-    {
-        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a DTLB miss.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3,4,5",
-        "EventCode": "0x05",
-        "EventName": "LD_HEAD.DTLB_MISS_AT_RET",
-        "PEBScounters": "0,1,2,3,4,5",
-        "SampleAfterValue": "1000003",
-        "Speculative": "1",
-        "UMask": "0x90",
-        "Unit": "cpu_atom"
-    },
-    {
         "BriefDescription": "Loads that miss the DTLB and hit the STLB.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x12",
         "EventName": "DTLB_LOAD_MISSES.STLB_HIT",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts loads that miss the DTLB (Data TLB) and hit the STLB (Second level TLB).",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x20",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles when at least one PMH is busy with a page walk for a demand load.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "1",
         "EventCode": "0x12",
         "EventName": "DTLB_LOAD_MISSES.WALK_ACTIVE",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a demand load.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x10",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to any page size.",
+        "EventCode": "0x08",
+        "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED",
+        "PublicDescription": "Counts the number of page walks completed due to loads (including SW prefetches) whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to any page size. Includes page walks that page fault.",
+        "SampleAfterValue": "200003",
+        "UMask": "0xe",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Load miss in all TLB levels causes a page walk that completes. (All page sizes)",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x12",
         "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts completed page walks  (all page sizes) caused by demand data loads. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0xe",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Page walks completed due to a demand data load to a 1G page.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x12",
         "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_1G",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts completed page walks  (1G sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Page walks completed due to a demand data load to a 2M/4M page.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x12",
         "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts completed page walks  (2M/4M sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Page walks completed due to a demand data load to a 4K page.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x12",
         "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_4K",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts completed page walks  (4K sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Number of page walks outstanding for a demand load in the PMH each cycle.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x12",
         "EventName": "DTLB_LOAD_MISSES.WALK_PENDING",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the number of page walks outstanding for a demand load in the PMH (Page Miss Handler) each cycle.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x10",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Stores that miss the DTLB and hit the STLB.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x13",
         "EventName": "DTLB_STORE_MISSES.STLB_HIT",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts stores that miss the DTLB (Data TLB) and hit the STLB (2nd Level TLB).",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x20",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles when at least one PMH is busy with a page walk for a store.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "1",
         "EventCode": "0x13",
         "EventName": "DTLB_STORE_MISSES.WALK_ACTIVE",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a store.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x10",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to any page size.",
+        "EventCode": "0x49",
+        "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED",
+        "PublicDescription": "Counts the number of page walks completed due to stores whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to any page size.  Includes page walks that page fault.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0xe",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Store misses in all TLB levels causes a page walk that completes. (All page sizes)",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x13",
         "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts completed page walks  (all page sizes) caused by demand data stores. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0xe",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Page walks completed due to a demand data store to a 1G page.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x13",
         "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_1G",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts completed page walks  (1G sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Page walks completed due to a demand data store to a 2M/4M page.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x13",
         "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts completed page walks  (2M/4M sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Page walks completed due to a demand data store to a 4K page.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x13",
         "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_4K",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts completed page walks  (4K sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Number of page walks outstanding for a store in the PMH each cycle.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x13",
         "EventName": "DTLB_STORE_MISSES.WALK_PENDING",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the number of page walks outstanding for a store in the PMH (Page Miss Handler) each cycle.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x10",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of page walks initiated by a instruction fetch that missed the first and second level TLBs.",
+        "EventCode": "0x85",
+        "EventName": "ITLB_MISSES.MISS_CAUSED_WALK",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks due to an instruction fetch that miss the PDE (Page Directory Entry) cache.",
+        "EventCode": "0x85",
+        "EventName": "ITLB_MISSES.PDE_CACHE_MISS",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x80",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Instruction fetch requests that miss the ITLB and hit the STLB.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x11",
         "EventName": "ITLB_MISSES.STLB_HIT",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts instruction fetch requests that miss the ITLB (Instruction TLB) and hit the STLB (Second-level TLB).",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x20",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Cycles when at least one PMH is busy with a page walk for code (instruction fetch) request.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "CounterMask": "1",
         "EventCode": "0x11",
         "EventName": "ITLB_MISSES.WALK_ACTIVE",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a code (instruction fetch) request.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x10",
         "Unit": "cpu_core"
     },
     {
+        "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to any page size.",
+        "EventCode": "0x85",
+        "EventName": "ITLB_MISSES.WALK_COMPLETED",
+        "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to any page size.  Includes page walks that page fault.",
+        "SampleAfterValue": "200003",
+        "UMask": "0xe",
+        "Unit": "cpu_atom"
+    },
+    {
         "BriefDescription": "Code miss in all TLB levels causes a page walk that completes. (All page sizes)",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x11",
         "EventName": "ITLB_MISSES.WALK_COMPLETED",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts completed page walks (all page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0xe",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Code miss in all TLB levels causes a page walk that completes. (2M/4M)",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x11",
         "EventName": "ITLB_MISSES.WALK_COMPLETED_2M_4M",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts completed page walks (2M/4M page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Code miss in all TLB levels causes a page walk that completes. (4K)",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x11",
         "EventName": "ITLB_MISSES.WALK_COMPLETED_4K",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts completed page walks (4K page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
     {
         "BriefDescription": "Number of page walks outstanding for an outstanding code request in the PMH each cycle.",
-        "CollectPEBSRecord": "2",
-        "Counter": "0,1,2,3",
         "EventCode": "0x11",
         "EventName": "ITLB_MISSES.WALK_PENDING",
-        "PEBScounters": "0,1,2,3",
+        "PublicDescription": "Counts the number of page walks outstanding for an outstanding code (instruction fetch) request in the PMH (Page Miss Handler) each cycle.",
         "SampleAfterValue": "100003",
-        "Speculative": "1",
         "UMask": "0x10",
         "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a DTLB miss.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.DTLB_MISS_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x90",
+        "Unit": "cpu_atom"
     }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json b/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json
new file mode 100644
index 000000000000..c57e9f325fb0
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json
@@ -0,0 +1,583 @@
+[
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not consumed by the backend due to frontend stalls.",
+        "MetricExpr": "TOPDOWN_FE_BOUND.ALL / SLOTS",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "tma_frontend_bound",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.",
+        "MetricExpr": "TOPDOWN_FE_BOUND.FRONTEND_LATENCY / SLOTS",
+        "MetricGroup": "TopdownL2;tma_frontend_bound_group",
+        "MetricName": "tma_frontend_latency",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not delivered by the frontend due to instruction cache misses.",
+        "MetricExpr": "TOPDOWN_FE_BOUND.ICACHE / SLOTS",
+        "MetricGroup": "TopdownL3;tma_frontend_latency_group",
+        "MetricName": "tma_icache",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not delivered by the frontend due to Instruction Table Lookaside Buffer (ITLB) misses.",
+        "MetricExpr": "TOPDOWN_FE_BOUND.ITLB / SLOTS",
+        "MetricGroup": "TopdownL3;tma_frontend_latency_group",
+        "MetricName": "tma_itlb",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not delivered by the frontend due to BACLEARS, which occurs when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend",
+        "MetricExpr": "TOPDOWN_FE_BOUND.BRANCH_DETECT / SLOTS",
+        "MetricGroup": "TopdownL3;tma_frontend_latency_group",
+        "MetricName": "tma_branch_detect",
+        "PublicDescription": "Counts the number of issue slots  that were not delivered by the frontend due to BACLEARS, which occurs when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend. Includes BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not delivered by the frontend due to BTCLEARS, which occurs when the Branch Target Buffer (BTB) predicts a taken branch.",
+        "MetricExpr": "TOPDOWN_FE_BOUND.BRANCH_RESTEER / SLOTS",
+        "MetricGroup": "TopdownL3;tma_frontend_latency_group",
+        "MetricName": "tma_branch_resteer",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.",
+        "MetricExpr": "TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH / SLOTS",
+        "MetricGroup": "TopdownL2;tma_frontend_bound_group",
+        "MetricName": "tma_frontend_bandwidth",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not delivered by the frontend due to the microcode sequencer (MS).",
+        "MetricExpr": "TOPDOWN_FE_BOUND.CISC / SLOTS",
+        "MetricGroup": "TopdownL3;tma_frontend_bandwidth_group",
+        "MetricName": "tma_cisc",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not delivered by the frontend due to decode stalls.",
+        "MetricExpr": "TOPDOWN_FE_BOUND.DECODE / SLOTS",
+        "MetricGroup": "TopdownL3;tma_frontend_bandwidth_group",
+        "MetricName": "tma_decode",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not delivered by the frontend due to wrong predecodes.",
+        "MetricExpr": "TOPDOWN_FE_BOUND.PREDECODE / SLOTS",
+        "MetricGroup": "TopdownL3;tma_frontend_bandwidth_group",
+        "MetricName": "tma_predecode",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not delivered by the frontend due to other common frontend stalls not categorized.",
+        "MetricExpr": "TOPDOWN_FE_BOUND.OTHER / SLOTS",
+        "MetricGroup": "TopdownL3;tma_frontend_bandwidth_group",
+        "MetricName": "tma_other_fb",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear",
+        "MetricExpr": "(SLOTS - (TOPDOWN_FE_BOUND.ALL + TOPDOWN_BE_BOUND.ALL + TOPDOWN_RETIRING.ALL)) / SLOTS",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "tma_bad_speculation",
+        "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window including relevant microcode flows and while uops are not yet available in the instruction queue (IQ). Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not consumed by the backend due to branch mispredicts.",
+        "MetricExpr": "TOPDOWN_BAD_SPECULATION.MISPREDICT / SLOTS",
+        "MetricGroup": "TopdownL2;tma_bad_speculation_group",
+        "MetricName": "tma_branch_mispredicts",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a machine clear (nuke) of any kind including memory ordering and memory disambiguation.",
+        "MetricExpr": "TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS / SLOTS",
+        "MetricGroup": "TopdownL2;tma_bad_speculation_group",
+        "MetricName": "tma_machine_clears",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not consumed by the backend due to a machine clear (slow nuke).",
+        "MetricExpr": "TOPDOWN_BAD_SPECULATION.NUKE / SLOTS",
+        "MetricGroup": "TopdownL3;tma_machine_clears_group",
+        "MetricName": "tma_nuke",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to SMC. ",
+        "MetricExpr": "tma_nuke * (MACHINE_CLEARS.SMC / MACHINE_CLEARS.SLOW)",
+        "MetricGroup": "TopdownL4;tma_nuke_group",
+        "MetricName": "tma_smc",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to memory ordering. ",
+        "MetricExpr": "tma_nuke * (MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.SLOW)",
+        "MetricGroup": "TopdownL4;tma_nuke_group",
+        "MetricName": "tma_memory_ordering",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to FP assists. ",
+        "MetricExpr": "tma_nuke * (MACHINE_CLEARS.FP_ASSIST / MACHINE_CLEARS.SLOW)",
+        "MetricGroup": "TopdownL4;tma_nuke_group",
+        "MetricName": "tma_fp_assist",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to memory disambiguation. ",
+        "MetricExpr": "tma_nuke * (MACHINE_CLEARS.DISAMBIGUATION / MACHINE_CLEARS.SLOW)",
+        "MetricGroup": "TopdownL4;tma_nuke_group",
+        "MetricName": "tma_disambiguation",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to page faults. ",
+        "MetricExpr": "tma_nuke * (MACHINE_CLEARS.PAGE_FAULT / MACHINE_CLEARS.SLOW)",
+        "MetricGroup": "TopdownL4;tma_nuke_group",
+        "MetricName": "tma_page_fault",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not consumed by the backend due to a machine clear classified as a fast nuke due to memory ordering, memory disambiguation and memory renaming.",
+        "MetricExpr": "TOPDOWN_BAD_SPECULATION.FASTNUKE / SLOTS",
+        "MetricGroup": "TopdownL3;tma_machine_clears_group",
+        "MetricName": "tma_fast_nuke",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the total number of issue slots  that were not consumed by the backend due to backend stalls",
+        "MetricExpr": "TOPDOWN_BE_BOUND.ALL / SLOTS",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "tma_backend_bound",
+        "PublicDescription": "Counts the total number of issue slots  that were not consumed by the backend due to backend stalls.  Note that uops must be available for consumption in order for this event to count.  If a uop is not available (IQ is empty), this event will not count.   The rest of these subevents count backend stalls, in cycles, due to an outstanding request which is memory bound vs core bound.   The subevents are not slot based events and therefore can not be precisely added or subtracted from the Backend_Bound_Aux subevents which are slot based.",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles due to backend bound stalls that are core execution bound and not attributed to outstanding demand load or store stalls. ",
+        "MetricExpr": "max(0, tma_backend_bound - tma_load_store_bound)",
+        "MetricGroup": "TopdownL2;tma_backend_bound_group",
+        "MetricName": "tma_core_bound",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to stores or loads. ",
+        "MetricExpr": "min((TOPDOWN_BE_BOUND.ALL / SLOTS), (LD_HEAD.ANY_AT_RET / CLKS) + tma_store_bound)",
+        "MetricGroup": "TopdownL2;tma_backend_bound_group",
+        "MetricName": "tma_load_store_bound",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to store buffer full.",
+        "MetricExpr": "tma_mem_scheduler * (MEM_SCHEDULER_BLOCK.ST_BUF / MEM_SCHEDULER_BLOCK.ALL)",
+        "MetricGroup": "TopdownL3;tma_load_store_bound_group",
+        "MetricName": "tma_store_bound",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a load block.",
+        "MetricExpr": "LD_HEAD.L1_BOUND_AT_RET / CLKS",
+        "MetricGroup": "TopdownL3;tma_load_store_bound_group",
+        "MetricName": "tma_l1_bound",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a store forward block.",
+        "MetricExpr": "LD_HEAD.ST_ADDR_AT_RET / CLKS",
+        "MetricGroup": "TopdownL4;tma_l1_bound_group",
+        "MetricName": "tma_store_fwd",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a first level TLB miss.",
+        "MetricExpr": "LD_HEAD.DTLB_MISS_AT_RET / CLKS",
+        "MetricGroup": "TopdownL4;tma_l1_bound_group",
+        "MetricName": "tma_stlb_hit",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a second level TLB miss requiring a page walk.",
+        "MetricExpr": "LD_HEAD.PGWALK_AT_RET / CLKS",
+        "MetricGroup": "TopdownL4;tma_l1_bound_group",
+        "MetricName": "tma_stlb_miss",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a number of other load blocks.",
+        "MetricExpr": "LD_HEAD.OTHER_AT_RET / CLKS",
+        "MetricGroup": "TopdownL4;tma_l1_bound_group",
+        "MetricName": "tma_other_l1",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the L2 Cache.",
+        "MetricExpr": "(MEM_BOUND_STALLS.LOAD_L2_HIT / CLKS) - (MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_L2_HIT / MEM_BOUND_STALLS.LOAD)",
+        "MetricGroup": "TopdownL3;tma_load_store_bound_group",
+        "MetricName": "tma_l2_bound",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the Last Level Cache (LLC) or other core with HITE/F/M.",
+        "MetricExpr": "(MEM_BOUND_STALLS.LOAD_LLC_HIT / CLKS) - (MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_LLC_HIT / MEM_BOUND_STALLS.LOAD)",
+        "MetricGroup": "TopdownL3;tma_load_store_bound_group",
+        "MetricName": "tma_l3_bound",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load miss which hit in DRAM or MMIO (Non-DRAM).",
+        "MetricExpr": "(MEM_BOUND_STALLS.LOAD_DRAM_HIT / CLKS) - (MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_DRAM_HIT / MEM_BOUND_STALLS.LOAD)",
+        "MetricGroup": "TopdownL3;tma_load_store_bound_group",
+        "MetricName": "tma_dram_bound",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load miss which hits in the L2, LLC, DRAM or MMIO (Non-DRAM) but could not be correctly attributed or cycles in which the load miss is waiting on a request buffer.",
+        "MetricExpr": "max(0, tma_load_store_bound - (tma_store_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_dram_bound))",
+        "MetricGroup": "TopdownL3;tma_load_store_bound_group",
+        "MetricName": "tma_other_load_store",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the total number of issue slots  that were not consumed by the backend due to backend stalls",
+        "MetricExpr": "tma_backend_bound",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "tma_backend_bound_aux",
+        "PublicDescription": "Counts the total number of issue slots  that were not consumed by the backend due to backend stalls.  Note that UOPS must be available for consumption in order for this event to count.  If a uop is not available (IQ is empty), this event will not count.  All of these subevents count backend stalls, in slots, due to a resource limitation.   These are not cycle based events and therefore can not be precisely added or subtracted from the Backend_Bound subevents which are cycle based.  These subevents are supplementary to Backend_Bound and can be used to analyze results from a resource perspective at allocation.  ",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the total number of issue slots  that were not consumed by the backend due to backend stalls",
+        "MetricExpr": "tma_backend_bound",
+        "MetricGroup": "TopdownL2;tma_backend_bound_aux_group",
+        "MetricName": "tma_resource_bound",
+        "PublicDescription": "Counts the total number of issue slots  that were not consumed by the backend due to backend stalls.  Note that uops must be available for consumption in order for this event to count.  If a uop is not available (IQ is empty), this event will not count. ",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not consumed by the backend due to memory reservation stalls in which a scheduler is not able to accept uops.",
+        "MetricExpr": "TOPDOWN_BE_BOUND.MEM_SCHEDULER / SLOTS",
+        "MetricGroup": "TopdownL3;tma_resource_bound_group",
+        "MetricName": "tma_mem_scheduler",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles, relative to the number of mem_scheduler slots, in which uops are blocked due to store buffer full",
+        "MetricExpr": "tma_mem_scheduler * (MEM_SCHEDULER_BLOCK.ST_BUF / MEM_SCHEDULER_BLOCK.ALL)",
+        "MetricGroup": "TopdownL4;tma_mem_scheduler_group",
+        "MetricName": "tma_st_buffer",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles, relative to the number of mem_scheduler slots, in which uops are blocked due to load buffer full",
+        "MetricExpr": "tma_mem_scheduler * MEM_SCHEDULER_BLOCK.LD_BUF / MEM_SCHEDULER_BLOCK.ALL",
+        "MetricGroup": "TopdownL4;tma_mem_scheduler_group",
+        "MetricName": "tma_ld_buffer",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles, relative to the number of mem_scheduler slots, in which uops are blocked due to RSV full relative ",
+        "MetricExpr": "tma_mem_scheduler * MEM_SCHEDULER_BLOCK.RSV / MEM_SCHEDULER_BLOCK.ALL",
+        "MetricGroup": "TopdownL4;tma_mem_scheduler_group",
+        "MetricName": "tma_rsv",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not consumed by the backend due to IEC or FPC RAT stalls, which can be due to FIQ or IEC reservation stalls in which the integer, floating point or SIMD scheduler is not able to accept uops.",
+        "MetricExpr": "TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER / SLOTS",
+        "MetricGroup": "TopdownL3;tma_resource_bound_group",
+        "MetricName": "tma_non_mem_scheduler",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not consumed by the backend due to the physical register file unable to accept an entry (marble stalls).",
+        "MetricExpr": "TOPDOWN_BE_BOUND.REGISTER / SLOTS",
+        "MetricGroup": "TopdownL3;tma_resource_bound_group",
+        "MetricName": "tma_register",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not consumed by the backend due to the reorder buffer being full (ROB stalls).",
+        "MetricExpr": "TOPDOWN_BE_BOUND.REORDER_BUFFER / SLOTS",
+        "MetricGroup": "TopdownL3;tma_resource_bound_group",
+        "MetricName": "tma_reorder_buffer",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not consumed by the backend due to certain allocation restrictions.",
+        "MetricExpr": "TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS / SLOTS",
+        "MetricGroup": "TopdownL3;tma_resource_bound_group",
+        "MetricName": "tma_alloc_restriction",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots  that were not consumed by the backend due to scoreboards from the instruction queue (IQ), jump execution unit (JEU), or microcode sequencer (MS).",
+        "MetricExpr": "TOPDOWN_BE_BOUND.SERIALIZATION / SLOTS",
+        "MetricGroup": "TopdownL3;tma_resource_bound_group",
+        "MetricName": "tma_serialization",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the numer of issue slots  that result in retirement slots. ",
+        "MetricExpr": "TOPDOWN_RETIRING.ALL / SLOTS",
+        "MetricGroup": "TopdownL1",
+        "MetricName": "tma_retiring",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of uops that are not from the microsequencer. ",
+        "MetricExpr": "(TOPDOWN_RETIRING.ALL - UOPS_RETIRED.MS) / SLOTS",
+        "MetricGroup": "TopdownL2;tma_retiring_group",
+        "MetricName": "tma_base",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of floating point operations per uop with all default weighting.",
+        "MetricExpr": "UOPS_RETIRED.FPDIV / SLOTS",
+        "MetricGroup": "TopdownL3;tma_base_group",
+        "MetricName": "tma_fp_uops",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of uops retired excluding ms and fp div uops.",
+        "MetricExpr": "(TOPDOWN_RETIRING.ALL - UOPS_RETIRED.MS - UOPS_RETIRED.FPDIV) / SLOTS",
+        "MetricGroup": "TopdownL3;tma_base_group",
+        "MetricName": "tma_other_ret",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS)",
+        "MetricExpr": "UOPS_RETIRED.MS / SLOTS",
+        "MetricGroup": "TopdownL2;tma_retiring_group",
+        "MetricName": "tma_ms_uops",
+        "PublicDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS).  This includes uops from flows due to complex instructions, faults, assists, and inserted flows.",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "",
+        "MetricExpr": "CPU_CLK_UNHALTED.CORE",
+        "MetricName": "CLKS"
+    },
+    {
+        "BriefDescription": "",
+        "MetricExpr": "CPU_CLK_UNHALTED.CORE_P",
+        "MetricName": "CLKS_P"
+    },
+    {
+        "BriefDescription": "",
+        "MetricExpr": "5 * CLKS",
+        "MetricName": "SLOTS"
+    },
+    {
+        "BriefDescription": "Instructions Per Cycle",
+        "MetricExpr": "INST_RETIRED.ANY / CLKS",
+        "MetricName": "IPC"
+    },
+    {
+        "BriefDescription": "Cycles Per Instruction",
+        "MetricExpr": "CLKS / INST_RETIRED.ANY",
+        "MetricName": "CPI"
+    },
+    {
+        "BriefDescription": "Uops Per Instruction",
+        "MetricExpr": "UOPS_RETIRED.ALL / INST_RETIRED.ANY",
+        "MetricName": "UPI"
+    },
+    {
+        "BriefDescription": "Percentage of total non-speculative loads with a store forward or unknown store address block",
+        "MetricExpr": "100 * LD_BLOCKS.DATA_UNKNOWN / MEM_UOPS_RETIRED.ALL_LOADS",
+        "MetricName": "Store_Fwd_Blocks"
+    },
+    {
+        "BriefDescription": "Percentage of total non-speculative loads with a address aliasing block",
+        "MetricExpr": "100 * LD_BLOCKS.4K_ALIAS / MEM_UOPS_RETIRED.ALL_LOADS",
+        "MetricName": "Address_Alias_Blocks"
+    },
+    {
+        "BriefDescription": "Percentage of total non-speculative loads that are splits",
+        "MetricExpr": "100 * MEM_UOPS_RETIRED.SPLIT_LOADS / MEM_UOPS_RETIRED.ALL_LOADS",
+        "MetricName": "Load_Splits"
+    },
+    {
+        "BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES",
+        "MetricName": "IpBranch"
+    },
+    {
+        "BriefDescription": "Instruction per (near) call (lower number means higher occurrence rate)",
+        "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.CALL",
+        "MetricName": "IpCall"
+    },
+    {
+        "BriefDescription": "Instructions per Load",
+        "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_LOADS",
+        "MetricName": "IpLoad"
+    },
+    {
+        "BriefDescription": "Instructions per Store",
+        "MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_STORES",
+        "MetricName": "IpStore"
+    },
+    {
+        "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction",
+        "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
+        "MetricName": "IpMispredict"
+    },
+    {
+        "BriefDescription": "Instructions per Far Branch",
+        "MetricExpr": "INST_RETIRED.ANY / (BR_INST_RETIRED.FAR_BRANCH / 2)",
+        "MetricName": "IpFarBranch"
+    },
+    {
+        "BriefDescription": "Ratio of all branches which mispredict",
+        "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.ALL_BRANCHES",
+        "MetricName": "Branch_Mispredict_Ratio"
+    },
+    {
+        "BriefDescription": "Ratio between Mispredicted branches and unknown branches",
+        "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / BACLEARS.ANY",
+        "MetricName": "Branch_Mispredict_to_Unknown_Branch_Ratio"
+    },
+    {
+        "BriefDescription": "Percentage of all uops which are ucode ops",
+        "MetricExpr": "100 * UOPS_RETIRED.MS / UOPS_RETIRED.ALL",
+        "MetricName": "Microcode_Uop_Ratio"
+    },
+    {
+        "BriefDescription": "Percentage of all uops which are FPDiv uops",
+        "MetricExpr": "100 * UOPS_RETIRED.FPDIV / UOPS_RETIRED.ALL",
+        "MetricName": "FPDiv_Uop_Ratio"
+    },
+    {
+        "BriefDescription": "Percentage of all uops which are IDiv uops",
+        "MetricExpr": "100 * UOPS_RETIRED.IDIV / UOPS_RETIRED.ALL",
+        "MetricName": "IDiv_Uop_Ratio"
+    },
+    {
+        "BriefDescription": "Percentage of all uops which are x87 uops",
+        "MetricExpr": "100 * UOPS_RETIRED.X87 / UOPS_RETIRED.ALL",
+        "MetricName": "X87_Uop_Ratio"
+    },
+    {
+        "BriefDescription": "Average Frequency Utilization relative nominal frequency",
+        "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC",
+        "MetricName": "Turbo_Utilization"
+    },
+    {
+        "BriefDescription": "Fraction of cycles spent in Kernel mode",
+        "MetricExpr": "cpu@CPU_CLK_UNHALTED.CORE@k / CPU_CLK_UNHALTED.CORE",
+        "MetricName": "Kernel_Utilization"
+    },
+    {
+        "BriefDescription": "Average CPU Utilization",
+        "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
+        "MetricName": "CPU_Utilization"
+    },
+    {
+        "BriefDescription": "Cycle cost per L2 hit",
+        "MetricExpr": "MEM_BOUND_STALLS.LOAD_L2_HIT / MEM_LOAD_UOPS_RETIRED.L2_HIT",
+        "MetricName": "Cycles_per_Demand_Load_L2_Hit"
+    },
+    {
+        "BriefDescription": "Cycle cost per LLC hit",
+        "MetricExpr": "MEM_BOUND_STALLS.LOAD_LLC_HIT / MEM_LOAD_UOPS_RETIRED.L3_HIT",
+        "MetricName": "Cycles_per_Demand_Load_L3_Hit"
+    },
+    {
+        "BriefDescription": "Cycle cost per DRAM hit",
+        "MetricExpr": "MEM_BOUND_STALLS.LOAD_DRAM_HIT / MEM_LOAD_UOPS_RETIRED.DRAM_HIT",
+        "MetricName": "Cycles_per_Demand_Load_DRAM_Hit"
+    },
+    {
+        "BriefDescription": "Percent of instruction miss cost that hit in the L2",
+        "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_L2_HIT / (MEM_BOUND_STALLS.IFETCH)",
+        "MetricName": "Inst_Miss_Cost_L2Hit_Percent"
+    },
+    {
+        "BriefDescription": "Percent of instruction miss cost that hit in the L3",
+        "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_LLC_HIT / (MEM_BOUND_STALLS.IFETCH)",
+        "MetricName": "Inst_Miss_Cost_L3Hit_Percent"
+    },
+    {
+        "BriefDescription": "Percent of instruction miss cost that hit in DRAM",
+        "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_DRAM_HIT / (MEM_BOUND_STALLS.IFETCH)",
+        "MetricName": "Inst_Miss_Cost_DRAMHit_Percent"
+    },
+    {
+        "BriefDescription": "load ops retired per 1000 instruction",
+        "MetricExpr": "1000 * MEM_UOPS_RETIRED.ALL_LOADS / INST_RETIRED.ANY",
+        "MetricName": "MemLoadPKI"
+    },
+    {
+        "BriefDescription": "C1 residency percent per core",
+        "MetricExpr": "cstate_core@c1\\-residency@ / TSC",
+        "MetricGroup": "Power",
+        "MetricName": "C1_Core_Residency",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "C6 residency percent per core",
+        "MetricExpr": "cstate_core@c6\\-residency@ / TSC",
+        "MetricGroup": "Power",
+        "MetricName": "C6_Core_Residency",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "C7 residency percent per core",
+        "MetricExpr": "cstate_core@c7\\-residency@ / TSC",
+        "MetricGroup": "Power",
+        "MetricName": "C7_Core_Residency",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "C2 residency percent per package",
+        "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC",
+        "MetricGroup": "Power",
+        "MetricName": "C2_Pkg_Residency",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "C3 residency percent per package",
+        "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC",
+        "MetricGroup": "Power",
+        "MetricName": "C3_Pkg_Residency",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "C6 residency percent per package",
+        "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC",
+        "MetricGroup": "Power",
+        "MetricName": "C6_Pkg_Residency",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "C7 residency percent per package",
+        "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC",
+        "MetricGroup": "Power",
+        "MetricName": "C7_Pkg_Residency",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "C8 residency percent per package",
+        "MetricExpr": "cstate_pkg@c8\\-residency@ / TSC",
+        "MetricGroup": "Power",
+        "MetricName": "C8_Pkg_Residency",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "C9 residency percent per package",
+        "MetricExpr": "cstate_pkg@c9\\-residency@ / TSC",
+        "MetricGroup": "Power",
+        "MetricName": "C9_Pkg_Residency",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "C10 residency percent per package",
+        "MetricExpr": "cstate_pkg@c10\\-residency@ / TSC",
+        "MetricGroup": "Power",
+        "MetricName": "C10_Pkg_Residency",
+        "ScaleUnit": "100%"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/cache.json b/tools/perf/pmu-events/arch/x86/alderlaken/cache.json
new file mode 100644
index 000000000000..043445ae14a8
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/alderlaken/cache.json
@@ -0,0 +1,330 @@
+[
+    {
+        "BriefDescription": "Counts the number of cacheable memory requests that miss in the LLC. Counts on a per core basis.",
+        "EventCode": "0x2e",
+        "EventName": "LONGEST_LAT_CACHE.MISS",
+        "PublicDescription": "Counts the number of cacheable memory requests that miss in the Last Level Cache (LLC). Requests include demand loads, reads for ownership (RFO), instruction fetches and L1 HW prefetches. If the platform has an L3 cache, the LLC is the L3 cache, otherwise it is the L2 cache. Counts on a per core basis.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x41"
+    },
+    {
+        "BriefDescription": "Counts the number of cacheable memory requests that access the LLC. Counts on a per core basis.",
+        "EventCode": "0x2e",
+        "EventName": "LONGEST_LAT_CACHE.REFERENCE",
+        "PublicDescription": "Counts the number of cacheable memory requests that access the Last Level Cache (LLC). Requests include demand loads, reads for ownership (RFO), instruction fetches and L1 HW prefetches. If the platform has an L3 cache, the LLC is the L3 cache, otherwise it is the L2 cache. Counts on a per core basis.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x4f"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the L2, LLC, DRAM or MMIO (Non-DRAM).",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS.IFETCH",
+        "PublicDescription": "Counts the number of cycles the core is stalled due to an instruction cache or translation lookaside buffer (TLB) miss which hit in the L2, LLC, DRAM or MMIO (Non-DRAM).",
+        "SampleAfterValue": "200003",
+        "UMask": "0x38"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in DRAM or MMIO (Non-DRAM).",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS.IFETCH_DRAM_HIT",
+        "PublicDescription": "Counts the number of cycles the core is stalled due to an instruction cache or translation lookaside buffer (TLB) miss which hit in DRAM or MMIO (non-DRAM).",
+        "SampleAfterValue": "200003",
+        "UMask": "0x20"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the L2 cache.",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS.IFETCH_L2_HIT",
+        "PublicDescription": "Counts the number of cycles the core is stalled due to an instruction cache or Translation Lookaside Buffer (TLB) miss which hit in the L2 cache.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x8"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the LLC or other core with HITE/F/M.",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS.IFETCH_LLC_HIT",
+        "PublicDescription": "Counts the number of cycles the core is stalled due to an instruction cache or Translation Lookaside Buffer (TLB) miss which hit in the Last Level Cache (LLC) or other core with HITE/F/M.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load miss which hit in the L2, LLC, DRAM or MMIO (Non-DRAM).",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS.LOAD",
+        "SampleAfterValue": "200003",
+        "UMask": "0x7"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load miss which hit in DRAM or MMIO (Non-DRAM).",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS.LOAD_DRAM_HIT",
+        "SampleAfterValue": "200003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load which hit in the L2 cache.",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS.LOAD_L2_HIT",
+        "SampleAfterValue": "200003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load which hit in the LLC or other core with HITE/F/M.",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS.LOAD_LLC_HIT",
+        "PublicDescription": "Counts the number of cycles the core is stalled due to a demand load which hit in the Last Level Cache (LLC) or other core with HITE/F/M.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of load uops retired that hit in DRAM.",
+        "Data_LA": "1",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.DRAM_HIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x80"
+    },
+    {
+        "BriefDescription": "Counts the number of load uops retired that hit in the L2 cache.",
+        "Data_LA": "1",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.L2_HIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of load uops retired that hit in the L3 cache.",
+        "Data_LA": "1",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.L3_HIT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that uops are blocked for any of the following reasons:  load buffer, store buffer or RSV full.",
+        "EventCode": "0x04",
+        "EventName": "MEM_SCHEDULER_BLOCK.ALL",
+        "SampleAfterValue": "20003",
+        "UMask": "0x7"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that uops are blocked due to a load buffer full condition.",
+        "EventCode": "0x04",
+        "EventName": "MEM_SCHEDULER_BLOCK.LD_BUF",
+        "SampleAfterValue": "20003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that uops are blocked due to an RSV full condition.",
+        "EventCode": "0x04",
+        "EventName": "MEM_SCHEDULER_BLOCK.RSV",
+        "SampleAfterValue": "20003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that uops are blocked due to a store buffer full condition.",
+        "EventCode": "0x04",
+        "EventName": "MEM_SCHEDULER_BLOCK.ST_BUF",
+        "SampleAfterValue": "20003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of load uops retired.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.ALL_LOADS",
+        "PEBS": "1",
+        "PublicDescription": "Counts the total number of load uops retired.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x81"
+    },
+    {
+        "BriefDescription": "Counts the number of store uops retired.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.ALL_STORES",
+        "PEBS": "1",
+        "PublicDescription": "Counts the total number of store uops retired.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x82"
+    },
+    {
+        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 128 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_128",
+        "MSRIndex": "0x3F6",
+        "MSRValue": "0x80",
+        "PEBS": "2",
+        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 128 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x5"
+    },
+    {
+        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 16 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_16",
+        "MSRIndex": "0x3F6",
+        "MSRValue": "0x10",
+        "PEBS": "2",
+        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 16 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x5"
+    },
+    {
+        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 256 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_256",
+        "MSRIndex": "0x3F6",
+        "MSRValue": "0x100",
+        "PEBS": "2",
+        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 256 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x5"
+    },
+    {
+        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 32 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_32",
+        "MSRIndex": "0x3F6",
+        "MSRValue": "0x20",
+        "PEBS": "2",
+        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 32 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x5"
+    },
+    {
+        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 4 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_4",
+        "MSRIndex": "0x3F6",
+        "MSRValue": "0x4",
+        "PEBS": "2",
+        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 4 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x5"
+    },
+    {
+        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 512 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_512",
+        "MSRIndex": "0x3F6",
+        "MSRValue": "0x200",
+        "PEBS": "2",
+        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 512 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x5"
+    },
+    {
+        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 64 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_64",
+        "MSRIndex": "0x3F6",
+        "MSRValue": "0x40",
+        "PEBS": "2",
+        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 64 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x5"
+    },
+    {
+        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 8 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_8",
+        "MSRIndex": "0x3F6",
+        "MSRValue": "0x8",
+        "PEBS": "2",
+        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 8 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x5"
+    },
+    {
+        "BriefDescription": "Counts the number of retired split load uops.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.SPLIT_LOADS",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x41"
+    },
+    {
+        "BriefDescription": "Counts the number of stores uops retired. Counts with or without PEBS enabled.",
+        "Data_LA": "1",
+        "EventCode": "0xd0",
+        "EventName": "MEM_UOPS_RETIRED.STORE_LATENCY",
+        "PEBS": "2",
+        "PublicDescription": "Counts the number of stores uops retired. Counts with or without PEBS enabled. If PEBS is enabled and a PEBS record is generated, will populate PEBS Latency and PEBS Data Source fields accordingly.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x6"
+    },
+    {
+        "BriefDescription": "Counts demand data reads that were supplied by the L3 cache.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_DATA_RD.L3_HIT",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x3F803C0001",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, and modified data was forwarded.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x10003C0001",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, but no data was forwarded.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_NO_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x4003C0001",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, and non-modified data was forwarded.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x8003C0001",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_RFO.L3_HIT",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x3F803C0002",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache where a snoop was sent, the snoop hit, and modified data was forwarded.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x10003C0002",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to instruction cache misses.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.ICACHE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x20"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/floating-point.json b/tools/perf/pmu-events/arch/x86/alderlaken/floating-point.json
new file mode 100644
index 000000000000..30e8ca3c1485
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/alderlaken/floating-point.json
@@ -0,0 +1,18 @@
+[
+    {
+        "BriefDescription": "Counts the number of floating point operations retired that required microcode assist.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.FP_ASSIST",
+        "PublicDescription": "Counts the number of floating point operations retired that required microcode assist, which is not a reflection of the number of FP operations, instructions or uops.",
+        "SampleAfterValue": "20003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of floating point divide uops retired (x87 and SSE, including x87 sqrt).",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.FPDIV",
+        "PEBS": "1",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x8"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/frontend.json b/tools/perf/pmu-events/arch/x86/alderlaken/frontend.json
new file mode 100644
index 000000000000..36898bab2bba
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/alderlaken/frontend.json
@@ -0,0 +1,26 @@
+[
+    {
+        "BriefDescription": "Counts the total number of BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.",
+        "EventCode": "0xe6",
+        "EventName": "BACLEARS.ANY",
+        "PublicDescription": "Counts the total number of BACLEARS, which occur when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend.  Includes BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of requests to the instruction cache for one or more bytes of a cache line.",
+        "EventCode": "0x80",
+        "EventName": "ICACHE.ACCESSES",
+        "PublicDescription": "Counts the total number of requests to the instruction cache.  The event only counts new cache line accesses, so that multiple back to back fetches to the exact same cache line or byte chunk count as one.  Specifically, the event counts when accesses from sequential code crosses the cache line boundary, or when a branch target is moved to a new line or to a non-sequential byte chunk of the same line.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x3"
+    },
+    {
+        "BriefDescription": "Counts the number of instruction cache misses.",
+        "EventCode": "0x80",
+        "EventName": "ICACHE.MISSES",
+        "PublicDescription": "Counts the number of missed requests to the instruction cache.  The event only counts new cache line accesses, so that multiple back to back fetches to the exact same cache line and byte chunk count as one.  Specifically, the event counts when accesses from sequential code crosses the cache line boundary, or when a branch target is moved to a new line or to a non-sequential byte chunk of the same line.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x2"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/memory.json b/tools/perf/pmu-events/arch/x86/alderlaken/memory.json
new file mode 100644
index 000000000000..f84bf8c43495
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/alderlaken/memory.json
@@ -0,0 +1,81 @@
+[
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to any number of reasons, including an L1 miss, WCB full, pagewalk, store address block or store data block, on a load that retires.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.ANY_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xff"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to a core bound stall including a store address match, a DTLB miss or a page walk that detains the load from retiring.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.L1_BOUND_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xf4"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to other block cases.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.OTHER_AT_RET",
+        "PublicDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to other block cases such as pipeline conflicts, fences, etc.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xc0"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a pagewalk.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.PGWALK_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xa0"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a store address match.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.ST_ADDR_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x84"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears due to memory ordering caused by a snoop from an external agent. Does not count internally generated machine clears such as those due to memory disambiguation.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.MEMORY_ORDERING",
+        "SampleAfterValue": "20003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_DATA_RD.L3_MISS",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x3F84400001",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_DATA_RD.L3_MISS_LOCAL",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x3F84400001",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_RFO.L3_MISS",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x3F84400002",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_RFO.L3_MISS_LOCAL",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x3F84400002",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/other.json b/tools/perf/pmu-events/arch/x86/alderlaken/other.json
new file mode 100644
index 000000000000..6336de61f628
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/alderlaken/other.json
@@ -0,0 +1,38 @@
+[
+    {
+        "BriefDescription": "Counts modified writebacks from L1 cache and L2 cache that have any type of response.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.COREWB_M.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x10008",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts demand data reads that have any type of response.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x10001",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.DEMAND_RFO.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x10002",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts streaming stores that have any type of response.",
+        "EventCode": "0xB7",
+        "EventName": "OCR.STREAMING_WR.ANY_RESPONSE",
+        "MSRIndex": "0x1a6,0x1a7",
+        "MSRValue": "0x10800",
+        "SampleAfterValue": "100003",
+        "UMask": "0x1"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/pipeline.json b/tools/perf/pmu-events/arch/x86/alderlaken/pipeline.json
new file mode 100644
index 000000000000..fa53ff11a509
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/alderlaken/pipeline.json
@@ -0,0 +1,533 @@
+[
+    {
+        "BriefDescription": "Counts the total number of branch instructions retired for all branch types.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.ALL_BRANCHES",
+        "PEBS": "1",
+        "PublicDescription": "Counts the total number of instructions in which the instruction pointer (IP) of the processor is resteered due to a branch instruction and the branch instruction successfully retires.  All branch type instructions are accounted for.",
+        "SampleAfterValue": "200003"
+    },
+    {
+        "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.NEAR_CALL",
+        "Deprecated": "1",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.CALL",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xf9"
+    },
+    {
+        "BriefDescription": "Counts the number of retired JCC (Jump on Conditional Code) branch instructions retired, includes both taken and not taken branches.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.COND",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x7e"
+    },
+    {
+        "BriefDescription": "Counts the number of taken JCC (Jump on Conditional Code) branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.COND_TAKEN",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfe"
+    },
+    {
+        "BriefDescription": "Counts the number of far branch instructions retired, includes far jump, far call and return, and interrupt call and return.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.FAR_BRANCH",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xbf"
+    },
+    {
+        "BriefDescription": "Counts the number of near indirect JMP and near indirect CALL branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.INDIRECT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xeb"
+    },
+    {
+        "BriefDescription": "Counts the number of near indirect CALL branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.INDIRECT_CALL",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfb"
+    },
+    {
+        "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.INDIRECT_CALL",
+        "Deprecated": "1",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.IND_CALL",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfb"
+    },
+    {
+        "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.COND",
+        "Deprecated": "1",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.JCC",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x7e"
+    },
+    {
+        "BriefDescription": "Counts the number of near CALL branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.NEAR_CALL",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xf9"
+    },
+    {
+        "BriefDescription": "Counts the number of near RET branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.NEAR_RETURN",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xf7"
+    },
+    {
+        "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.INDIRECT",
+        "Deprecated": "1",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.NON_RETURN_IND",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xeb"
+    },
+    {
+        "BriefDescription": "Counts the number of near relative CALL branch instructions retired.",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.REL_CALL",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfd"
+    },
+    {
+        "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.NEAR_RETURN",
+        "Deprecated": "1",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.RETURN",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xf7"
+    },
+    {
+        "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.COND_TAKEN",
+        "Deprecated": "1",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.TAKEN_JCC",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfe"
+    },
+    {
+        "BriefDescription": "Counts the total number of mispredicted branch instructions retired for all branch types.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.ALL_BRANCHES",
+        "PEBS": "1",
+        "PublicDescription": "Counts the total number of mispredicted branch instructions retired.  All branch type instructions are accounted for.  Prediction of the branch target address enables the processor to begin executing instructions before the non-speculative execution path is known. The branch prediction unit (BPU) predicts the target address based on the instruction pointer (IP) of the branch and on the execution path through which execution reached this IP.    A branch misprediction occurs when the prediction is wrong, and results in discarding all instructions executed in the speculative path and re-fetching from the correct path.",
+        "SampleAfterValue": "200003"
+    },
+    {
+        "BriefDescription": "Counts the number of mispredicted JCC (Jump on Conditional Code) branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.COND",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x7e"
+    },
+    {
+        "BriefDescription": "Counts the number of mispredicted taken JCC (Jump on Conditional Code) branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.COND_TAKEN",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfe"
+    },
+    {
+        "BriefDescription": "Counts the number of mispredicted near indirect JMP and near indirect CALL branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.INDIRECT",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xeb"
+    },
+    {
+        "BriefDescription": "Counts the number of mispredicted near indirect CALL branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.INDIRECT_CALL",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfb"
+    },
+    {
+        "BriefDescription": "This event is deprecated. Refer to new event BR_MISP_RETIRED.INDIRECT_CALL",
+        "Deprecated": "1",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.IND_CALL",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfb"
+    },
+    {
+        "BriefDescription": "This event is deprecated. Refer to new event BR_MISP_RETIRED.COND",
+        "Deprecated": "1",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.JCC",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0x7e"
+    },
+    {
+        "BriefDescription": "This event is deprecated. Refer to new event BR_MISP_RETIRED.INDIRECT",
+        "Deprecated": "1",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.NON_RETURN_IND",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xeb"
+    },
+    {
+        "BriefDescription": "Counts the number of mispredicted near RET branch instructions retired.",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.RETURN",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xf7"
+    },
+    {
+        "BriefDescription": "This event is deprecated. Refer to new event BR_MISP_RETIRED.COND_TAKEN",
+        "Deprecated": "1",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.TAKEN_JCC",
+        "PEBS": "1",
+        "SampleAfterValue": "200003",
+        "UMask": "0xfe"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted core clock cycles. (Fixed event)",
+        "EventName": "CPU_CLK_UNHALTED.CORE",
+        "PublicDescription": "Counts the number of core cycles while the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. The core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time. This event uses fixed counter 1.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted core clock cycles.",
+        "EventCode": "0x3c",
+        "EventName": "CPU_CLK_UNHALTED.CORE_P",
+        "PublicDescription": "Counts the number of core cycles while the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. The core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time. This event uses a programmable general purpose performance counter.",
+        "SampleAfterValue": "2000003"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted reference clock cycles at TSC frequency. (Fixed event)",
+        "EventName": "CPU_CLK_UNHALTED.REF_TSC",
+        "PublicDescription": "Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. This event is not affected by core frequency changes and increments at a fixed frequency that is also used for the Time Stamp Counter (TSC). This event uses fixed counter 2.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x3"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted reference clock cycles at TSC frequency.",
+        "EventCode": "0x3c",
+        "EventName": "CPU_CLK_UNHALTED.REF_TSC_P",
+        "PublicDescription": "Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. This event is not affected by core frequency changes and increments at a fixed frequency that is also used for the Time Stamp Counter (TSC). This event uses a programmable general purpose performance counter.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted core clock cycles. (Fixed event)",
+        "EventName": "CPU_CLK_UNHALTED.THREAD",
+        "PublicDescription": "Counts the number of core cycles while the core is not in a halt state.  The core enters the halt state when it is running the HLT instruction. The core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time.  This event uses fixed counter 1.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted core clock cycles.",
+        "EventCode": "0x3c",
+        "EventName": "CPU_CLK_UNHALTED.THREAD_P",
+        "PublicDescription": "Counts the number of core cycles while the core is not in a halt state.  The core enters the halt state when it is running the HLT instruction. The core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time. This event uses a programmable general purpose performance counter.",
+        "SampleAfterValue": "2000003"
+    },
+    {
+        "BriefDescription": "Counts the total number of instructions retired. (Fixed event)",
+        "EventName": "INST_RETIRED.ANY",
+        "PEBS": "1",
+        "PublicDescription": "Counts the total number of instructions that retired. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. This event continues counting during hardware interrupts, traps, and inside interrupt handlers. This event uses fixed counter 0.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the total number of instructions retired.",
+        "EventCode": "0xc0",
+        "EventName": "INST_RETIRED.ANY_P",
+        "PEBS": "1",
+        "PublicDescription": "Counts the total number of instructions that retired. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. This event continues counting during hardware interrupts, traps, and inside interrupt handlers. This event uses a programmable general purpose performance counter.",
+        "SampleAfterValue": "2000003"
+    },
+    {
+        "BriefDescription": "This event is deprecated. Refer to new event LD_BLOCKS.ADDRESS_ALIAS",
+        "Deprecated": "1",
+        "EventCode": "0x03",
+        "EventName": "LD_BLOCKS.4K_ALIAS",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of retired loads that are blocked because it initially appears to be store forward blocked, but subsequently is shown not to be blocked based on 4K alias check.",
+        "EventCode": "0x03",
+        "EventName": "LD_BLOCKS.ADDRESS_ALIAS",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of retired loads that are blocked because its address exactly matches an older store whose data is not ready.",
+        "EventCode": "0x03",
+        "EventName": "LD_BLOCKS.DATA_UNKNOWN",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears due to memory ordering in which an internal load passes an older store within the same CPU.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.DISAMBIGUATION",
+        "SampleAfterValue": "20003",
+        "UMask": "0x8"
+    },
+    {
+        "BriefDescription": "Counts the number of machines clears due to memory renaming.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.MRN_NUKE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x80"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears due to a page fault.  Counts both I-Side and D-Side (Loads/Stores) page faults.  A page fault occurs when either the page is not present, or an access violation occurs.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.PAGE_FAULT",
+        "SampleAfterValue": "20003",
+        "UMask": "0x20"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears that flush the pipeline and restart the machine with the use of microcode due to SMC, MEMORY_ORDERING, FP_ASSISTS, PAGE_FAULT, DISAMBIGUATION, and FPC_VIRTUAL_TRAP.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.SLOW",
+        "SampleAfterValue": "20003",
+        "UMask": "0x6f"
+    },
+    {
+        "BriefDescription": "Counts the number of machine clears due to program modifying data (self modifying code) within 1K of a recently fetched code page.",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.SMC",
+        "SampleAfterValue": "20003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots not consumed by the backend due to a micro-sequencer (MS) scoreboard, which stalls the front-end from issuing from the UROM until a specified older uop retires.",
+        "EventCode": "0x75",
+        "EventName": "SERIALIZATION.NON_C01_MS_SCB",
+        "PublicDescription": "Counts the number of issue slots not consumed by the backend due to a micro-sequencer (MS) scoreboard, which stalls the front-end from issuing from the UROM until a specified older uop retires. The most commonly executed instruction with an MS scoreboard is PAUSE.",
+        "SampleAfterValue": "200003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.",
+        "EventCode": "0x73",
+        "EventName": "TOPDOWN_BAD_SPECULATION.ALL",
+        "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window including relevant microcode flows and while uops are not yet available in the instruction queue (IQ) even if an FE_bound event occurs during this period. Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.",
+        "SampleAfterValue": "1000003"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to fast nukes such as memory ordering and memory disambiguation machine clears.",
+        "EventCode": "0x73",
+        "EventName": "TOPDOWN_BAD_SPECULATION.FASTNUKE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a machine clear (nuke) of any kind including memory ordering and memory disambiguation.",
+        "EventCode": "0x73",
+        "EventName": "TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x3"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to branch mispredicts.",
+        "EventCode": "0x73",
+        "EventName": "TOPDOWN_BAD_SPECULATION.MISPREDICT",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to a machine clear (nuke).",
+        "EventCode": "0x73",
+        "EventName": "TOPDOWN_BAD_SPECULATION.NUKE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the total number of issue slots every cycle that were not consumed by the backend due to backend stalls.",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.ALL",
+        "SampleAfterValue": "1000003"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to certain allocation restrictions.",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to memory reservation stalls in which a scheduler is not able to accept uops.",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.MEM_SCHEDULER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to IEC or FPC RAT stalls, which can be due to FIQ or IEC reservation stalls in which the integer, floating point or SIMD scheduler is not able to accept uops.",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x8"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to the physical register file unable to accept an entry (marble stalls).",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.REGISTER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x20"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to the reorder buffer being full (ROB stalls).",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.REORDER_BUFFER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x40"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to scoreboards from the instruction queue (IQ), jump execution unit (JEU), or microcode sequencer (MS).",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.SERIALIZATION",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "Counts the total number of issue slots every cycle that were not consumed by the backend due to frontend stalls.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.ALL",
+        "SampleAfterValue": "1000003"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BACLEARS.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.BRANCH_DETECT",
+        "PublicDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BACLEARS, which occurs when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend. Includes BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BTCLEARS.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.BRANCH_RESTEER",
+        "PublicDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to BTCLEARS, which occurs when the Branch Target Buffer (BTB) predicts a taken branch.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x40"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to the microcode sequencer (MS).",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.CISC",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to decode stalls.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.DECODE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x8"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x8d"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to a latency related stalls including BACLEARs, BTCLEARs, ITLB misses, and ICache misses.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.FRONTEND_LATENCY",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x72"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to ITLB misses.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.ITLB",
+        "PublicDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to Instruction Table Lookaside Buffer (ITLB) misses.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to other common frontend stalls not categorized.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.OTHER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x80"
+    },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not delivered by the frontend due to wrong predecodes.",
+        "EventCode": "0x71",
+        "EventName": "TOPDOWN_FE_BOUND.PREDECODE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4"
+    },
+    {
+        "BriefDescription": "Counts the total number of consumed retirement slots.",
+        "EventCode": "0xc2",
+        "EventName": "TOPDOWN_RETIRING.ALL",
+        "PEBS": "1",
+        "SampleAfterValue": "1000003"
+    },
+    {
+        "BriefDescription": "Counts the total number of uops retired.",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.ALL",
+        "PEBS": "1",
+        "SampleAfterValue": "2000003"
+    },
+    {
+        "BriefDescription": "Counts the number of integer divide uops retired.",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.IDIV",
+        "PEBS": "1",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "Counts the number of uops that are from complex flows issued by the micro-sequencer (MS).",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.MS",
+        "PEBS": "1",
+        "PublicDescription": "Counts the number of uops that are from complex flows issued by the Microcode Sequencer (MS). This includes uops from flows due to complex instructions, faults, assists, and inserted flows.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of x87 uops retired, includes those in MS flows.",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.X87",
+        "PEBS": "1",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x2"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/uncore-memory.json b/tools/perf/pmu-events/arch/x86/alderlaken/uncore-memory.json
new file mode 100644
index 000000000000..2ccd9cf96957
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/alderlaken/uncore-memory.json
@@ -0,0 +1,175 @@
+[
+    {
+        "BriefDescription": "Counts every 64B read  request entering the Memory Controller 0 to DRAM (sum of all channels).",
+        "EventName": "UNC_MC0_RDCAS_COUNT_FREERUN",
+        "PerPkg": "1",
+        "PublicDescription": "Counts every 64B read request entering the Memory Controller 0 to DRAM (sum of all channels).",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Counts every 64B write request entering the Memory Controller 0 to DRAM (sum of all channels). Each write request counts as a new request incrementing this counter. However, same cache line write requests (both full and partial) are combined to a single 64 byte data transfer to DRAM.",
+        "EventName": "UNC_MC0_WRCAS_COUNT_FREERUN",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Counts every 64B read request entering the Memory Controller 1 to DRAM (sum of all channels).",
+        "EventName": "UNC_MC1_RDCAS_COUNT_FREERUN",
+        "PerPkg": "1",
+        "PublicDescription": "Counts every 64B read entering the Memory Controller 1 to DRAM (sum of all channels).",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Counts every 64B write request entering the Memory Controller 1 to DRAM (sum of all channels). Each write request counts as a new request incrementing this counter. However, same cache line write requests (both full and partial) are combined to a single 64 byte data transfer to DRAM.",
+        "EventName": "UNC_MC1_WRCAS_COUNT_FREERUN",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "ACT command for a read request sent to DRAM",
+        "EventCode": "0x24",
+        "EventName": "UNC_M_ACT_COUNT_RD",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "ACT command sent to DRAM",
+        "EventCode": "0x26",
+        "EventName": "UNC_M_ACT_COUNT_TOTAL",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "ACT command for a write request sent to DRAM",
+        "EventCode": "0x25",
+        "EventName": "UNC_M_ACT_COUNT_WR",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Read CAS command sent to DRAM",
+        "EventCode": "0x22",
+        "EventName": "UNC_M_CAS_COUNT_RD",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Write CAS command sent to DRAM",
+        "EventCode": "0x23",
+        "EventName": "UNC_M_CAS_COUNT_WR",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Number of clocks",
+        "EventCode": "0x01",
+        "EventName": "UNC_M_CLOCKTICKS",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "incoming read request page status is Page Empty",
+        "EventCode": "0x1D",
+        "EventName": "UNC_M_DRAM_PAGE_EMPTY_RD",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "incoming write request page status is Page Empty",
+        "EventCode": "0x20",
+        "EventName": "UNC_M_DRAM_PAGE_EMPTY_WR",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "incoming read request page status is Page Hit",
+        "EventCode": "0x1C",
+        "EventName": "UNC_M_DRAM_PAGE_HIT_RD",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "incoming write request page status is Page Hit",
+        "EventCode": "0x1F",
+        "EventName": "UNC_M_DRAM_PAGE_HIT_WR",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "incoming read request page status is Page Miss",
+        "EventCode": "0x1E",
+        "EventName": "UNC_M_DRAM_PAGE_MISS_RD",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "incoming write request page status is Page Miss",
+        "EventCode": "0x21",
+        "EventName": "UNC_M_DRAM_PAGE_MISS_WR",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Any Rank at Hot state",
+        "EventCode": "0x19",
+        "EventName": "UNC_M_DRAM_THERMAL_HOT",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Any Rank at Warm state",
+        "EventCode": "0x1A",
+        "EventName": "UNC_M_DRAM_THERMAL_WARM",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Incoming read prefetch request from IA.",
+        "EventCode": "0x0A",
+        "EventName": "UNC_M_PREFETCH_RD",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "PRE command sent to DRAM due to page table idle timer expiration",
+        "EventCode": "0x28",
+        "EventName": "UNC_M_PRE_COUNT_IDLE",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "PRE command sent to DRAM for a read/write request",
+        "EventCode": "0x27",
+        "EventName": "UNC_M_PRE_COUNT_PAGE_MISS",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Incoming VC0 read request",
+        "EventCode": "0x02",
+        "EventName": "UNC_M_VC0_REQUESTS_RD",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Incoming VC0 write request",
+        "EventCode": "0x03",
+        "EventName": "UNC_M_VC0_REQUESTS_WR",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Incoming VC1 read request",
+        "EventCode": "0x04",
+        "EventName": "UNC_M_VC1_REQUESTS_RD",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    },
+    {
+        "BriefDescription": "Incoming VC1 write request",
+        "EventCode": "0x05",
+        "EventName": "UNC_M_VC1_REQUESTS_WR",
+        "PerPkg": "1",
+        "Unit": "iMC"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/uncore-other.json b/tools/perf/pmu-events/arch/x86/alderlaken/uncore-other.json
new file mode 100644
index 000000000000..f9e7777cd2be
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/alderlaken/uncore-other.json
@@ -0,0 +1,33 @@
+[
+    {
+        "BriefDescription": "Number of requests allocated in Coherency Tracker.",
+        "EventCode": "0x84",
+        "EventName": "UNC_ARB_COH_TRK_REQUESTS.ALL",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "ARB"
+    },
+    {
+        "BriefDescription": "Each cycle counts number of all outgoing valid entries in ReqTrk. Such entry is defined as valid from its allocation in ReqTrk till deallocation. Accounts for Coherent and non-coherent traffic.",
+        "EventCode": "0x80",
+        "EventName": "UNC_ARB_TRK_OCCUPANCY.ALL",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "ARB"
+    },
+    {
+        "BriefDescription": "Counts the number of coherent and in-coherent requests initiated by IA cores, processor graphic units, or LLC.",
+        "EventCode": "0x81",
+        "EventName": "UNC_ARB_TRK_REQUESTS.ALL",
+        "PerPkg": "1",
+        "UMask": "0x1",
+        "Unit": "ARB"
+    },
+    {
+        "BriefDescription": "This 48-bit fixed counter counts the UCLK cycles.",
+        "EventCode": "0xff",
+        "EventName": "UNC_CLOCK.SOCKET",
+        "PerPkg": "1",
+        "Unit": "CLOCK"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/virtual-memory.json b/tools/perf/pmu-events/arch/x86/alderlaken/virtual-memory.json
new file mode 100644
index 000000000000..67fd640f790e
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/alderlaken/virtual-memory.json
@@ -0,0 +1,47 @@
+[
+    {
+        "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to any page size.",
+        "EventCode": "0x08",
+        "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED",
+        "PublicDescription": "Counts the number of page walks completed due to loads (including SW prefetches) whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to any page size. Includes page walks that page fault.",
+        "SampleAfterValue": "200003",
+        "UMask": "0xe"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to any page size.",
+        "EventCode": "0x49",
+        "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED",
+        "PublicDescription": "Counts the number of page walks completed due to stores whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to any page size.  Includes page walks that page fault.",
+        "SampleAfterValue": "2000003",
+        "UMask": "0xe"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks initiated by a instruction fetch that missed the first and second level TLBs.",
+        "EventCode": "0x85",
+        "EventName": "ITLB_MISSES.MISS_CAUSED_WALK",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks due to an instruction fetch that miss the PDE (Page Directory Entry) cache.",
+        "EventCode": "0x85",
+        "EventName": "ITLB_MISSES.PDE_CACHE_MISS",
+        "SampleAfterValue": "2000003",
+        "UMask": "0x80"
+    },
+    {
+        "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to any page size.",
+        "EventCode": "0x85",
+        "EventName": "ITLB_MISSES.WALK_COMPLETED",
+        "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to any page size.  Includes page walks that page fault.",
+        "SampleAfterValue": "200003",
+        "UMask": "0xe"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a DTLB miss.",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.DTLB_MISS_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x90"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index 5e609b876790..df47462a125f 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -1,5 +1,6 @@
 Family-model,Version,Filename,EventType
-GenuineIntel-6-(97|9A|B7|BA|BE|BF),v1.15,alderlake,core
+GenuineIntel-6-(97|9A|B7|BA|BF),v1.16,alderlake,core
+GenuineIntel-6-BE,v1.16,alderlaken,core
 GenuineIntel-6-(1C|26|27|35|36),v4,bonnell,core
 GenuineIntel-6-(3D|47),v26,broadwell,core
 GenuineIntel-6-56,v23,broadwellde,core
diff --git a/tools/perf/pmu-events/jevents.py b/tools/perf/pmu-events/jevents.py
index 0daa3e007528..4c398e0eeb2f 100755
--- a/tools/perf/pmu-events/jevents.py
+++ b/tools/perf/pmu-events/jevents.py
@@ -4,6 +4,7 @@
 import argparse
 import csv
 import json
+import metric
 import os
 import sys
 from typing import (Callable, Dict, Optional, Sequence, Set, Tuple)
@@ -268,9 +269,10 @@ class JsonEvent:
     self.metric_name = jd.get('MetricName')
     self.metric_group = jd.get('MetricGroup')
     self.metric_constraint = jd.get('MetricConstraint')
-    self.metric_expr = jd.get('MetricExpr')
-    if self.metric_expr:
-      self.metric_expr = self.metric_expr.replace('\\', '\\\\')
+    self.metric_expr = None
+    if 'MetricExpr' in jd:
+       self.metric_expr = metric.ParsePerfJson(jd['MetricExpr']).Simplify()
+
     arch_std = jd.get('ArchStdEvent')
     if precise and self.desc and '(Precise Event)' not in self.desc:
       extra_desc += ' (Must be precise)' if precise == '2' else (' (Precise '
@@ -322,6 +324,10 @@ class JsonEvent:
     s = ''
     for attr in _json_event_attributes:
       x = getattr(self, attr)
+      if x and attr == 'metric_expr':
+        # Convert parsed metric expressions into a string. Slashes
+        # must be doubled in the file.
+        x = x.ToPerfJson().replace('\\', '\\\\')
       s += f'{x}\\000' if x else '\\000'
     return s
 
diff --git a/tools/perf/pmu-events/metric.py b/tools/perf/pmu-events/metric.py
new file mode 100644
index 000000000000..4797ed4fd817
--- /dev/null
+++ b/tools/perf/pmu-events/metric.py
@@ -0,0 +1,502 @@
+# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+"""Parse or generate representations of perf metrics."""
+import ast
+import decimal
+import json
+import re
+from typing import Dict, List, Optional, Set, Union
+
+
+class Expression:
+  """Abstract base class of elements in a metric expression."""
+
+  def ToPerfJson(self) -> str:
+    """Returns a perf json file encoded representation."""
+    raise NotImplementedError()
+
+  def ToPython(self) -> str:
+    """Returns a python expr parseable representation."""
+    raise NotImplementedError()
+
+  def Simplify(self):
+    """Returns a simplified version of self."""
+    raise NotImplementedError()
+
+  def Equals(self, other) -> bool:
+    """Returns true when two expressions are the same."""
+    raise NotImplementedError()
+
+  def __str__(self) -> str:
+    return self.ToPerfJson()
+
+  def __or__(self, other: Union[int, float, 'Expression']) -> 'Operator':
+    return Operator('|', self, other)
+
+  def __ror__(self, other: Union[int, float, 'Expression']) -> 'Operator':
+    return Operator('|', other, self)
+
+  def __xor__(self, other: Union[int, float, 'Expression']) -> 'Operator':
+    return Operator('^', self, other)
+
+  def __and__(self, other: Union[int, float, 'Expression']) -> 'Operator':
+    return Operator('&', self, other)
+
+  def __lt__(self, other: Union[int, float, 'Expression']) -> 'Operator':
+    return Operator('<', self, other)
+
+  def __gt__(self, other: Union[int, float, 'Expression']) -> 'Operator':
+    return Operator('>', self, other)
+
+  def __add__(self, other: Union[int, float, 'Expression']) -> 'Operator':
+    return Operator('+', self, other)
+
+  def __radd__(self, other: Union[int, float, 'Expression']) -> 'Operator':
+    return Operator('+', other, self)
+
+  def __sub__(self, other: Union[int, float, 'Expression']) -> 'Operator':
+    return Operator('-', self, other)
+
+  def __rsub__(self, other: Union[int, float, 'Expression']) -> 'Operator':
+    return Operator('-', other, self)
+
+  def __mul__(self, other: Union[int, float, 'Expression']) -> 'Operator':
+    return Operator('*', self, other)
+
+  def __rmul__(self, other: Union[int, float, 'Expression']) -> 'Operator':
+    return Operator('*', other, self)
+
+  def __truediv__(self, other: Union[int, float, 'Expression']) -> 'Operator':
+    return Operator('/', self, other)
+
+  def __rtruediv__(self, other: Union[int, float, 'Expression']) -> 'Operator':
+    return Operator('/', other, self)
+
+  def __mod__(self, other: Union[int, float, 'Expression']) -> 'Operator':
+    return Operator('%', self, other)
+
+
+def _Constify(val: Union[bool, int, float, Expression]) -> Expression:
+  """Used to ensure that the nodes in the expression tree are all Expression."""
+  if isinstance(val, bool):
+    return Constant(1 if val else 0)
+  if isinstance(val, (int, float)):
+    return Constant(val)
+  return val
+
+
+# Simple lookup for operator precedence, used to avoid unnecessary
+# brackets. Precedence matches that of python and the simple expression parser.
+_PRECEDENCE = {
+    '|': 0,
+    '^': 1,
+    '&': 2,
+    '<': 3,
+    '>': 3,
+    '+': 4,
+    '-': 4,
+    '*': 5,
+    '/': 5,
+    '%': 5,
+}
+
+
+class Operator(Expression):
+  """Represents a binary operator in the parse tree."""
+
+  def __init__(self, operator: str, lhs: Union[int, float, Expression],
+               rhs: Union[int, float, Expression]):
+    self.operator = operator
+    self.lhs = _Constify(lhs)
+    self.rhs = _Constify(rhs)
+
+  def Bracket(self,
+              other: Expression,
+              other_str: str,
+              rhs: bool = False) -> str:
+    """If necessary brackets the given other value.
+
+    If ``other`` is an operator then a bracket is necessary when
+    this/self operator has higher precedence. Consider: '(a + b) * c',
+    ``other_str`` will be 'a + b'. A bracket is necessary as without
+    the bracket 'a + b * c' will evaluate 'b * c' first. However, '(a
+    * b) + c' doesn't need a bracket as 'a * b' will always be
+    evaluated first. For 'a / (b * c)' (ie the same precedence level
+    operations) then we add the bracket to best match the original
+    input, but not for '(a / b) * c' where the bracket is unnecessary.
+
+    Args:
+      other (Expression): is a lhs or rhs operator
+      other_str (str): ``other`` in the appropriate string form
+      rhs (bool):  is ``other`` on the RHS
+
+    Returns:
+      str: possibly bracketed other_str
+    """
+    if isinstance(other, Operator):
+      if _PRECEDENCE.get(self.operator, -1) > _PRECEDENCE.get(
+          other.operator, -1):
+        return f'({other_str})'
+      if rhs and _PRECEDENCE.get(self.operator, -1) == _PRECEDENCE.get(
+          other.operator, -1):
+        return f'({other_str})'
+    return other_str
+
+  def ToPerfJson(self):
+    return (f'{self.Bracket(self.lhs, self.lhs.ToPerfJson())} {self.operator} '
+            f'{self.Bracket(self.rhs, self.rhs.ToPerfJson(), True)}')
+
+  def ToPython(self):
+    return (f'{self.Bracket(self.lhs, self.lhs.ToPython())} {self.operator} '
+            f'{self.Bracket(self.rhs, self.rhs.ToPython(), True)}')
+
+  def Simplify(self) -> Expression:
+    lhs = self.lhs.Simplify()
+    rhs = self.rhs.Simplify()
+    if isinstance(lhs, Constant) and isinstance(rhs, Constant):
+      return Constant(ast.literal_eval(lhs + self.operator + rhs))
+
+    if isinstance(self.lhs, Constant):
+      if self.operator in ('+', '|') and lhs.value == '0':
+        return rhs
+
+      # Simplify multiplication by 0 except for the slot event which
+      # is deliberately introduced using this pattern.
+      if self.operator == '*' and lhs.value == '0' and (
+          not isinstance(rhs, Event) or 'slots' not in rhs.name.lower()):
+        return Constant(0)
+
+      if self.operator == '*' and lhs.value == '1':
+        return rhs
+
+    if isinstance(rhs, Constant):
+      if self.operator in ('+', '|') and rhs.value == '0':
+        return lhs
+
+      if self.operator == '*' and rhs.value == '0':
+        return Constant(0)
+
+      if self.operator == '*' and self.rhs.value == '1':
+        return lhs
+
+    return Operator(self.operator, lhs, rhs)
+
+  def Equals(self, other: Expression) -> bool:
+    if isinstance(other, Operator):
+      return self.operator == other.operator and self.lhs.Equals(
+          other.lhs) and self.rhs.Equals(other.rhs)
+    return False
+
+
+class Select(Expression):
+  """Represents a select ternary in the parse tree."""
+
+  def __init__(self, true_val: Union[int, float, Expression],
+               cond: Union[int, float, Expression],
+               false_val: Union[int, float, Expression]):
+    self.true_val = _Constify(true_val)
+    self.cond = _Constify(cond)
+    self.false_val = _Constify(false_val)
+
+  def ToPerfJson(self):
+    true_str = self.true_val.ToPerfJson()
+    cond_str = self.cond.ToPerfJson()
+    false_str = self.false_val.ToPerfJson()
+    return f'({true_str} if {cond_str} else {false_str})'
+
+  def ToPython(self):
+    return (f'Select({self.true_val.ToPython()}, {self.cond.ToPython()}, '
+            f'{self.false_val.ToPython()})')
+
+  def Simplify(self) -> Expression:
+    cond = self.cond.Simplify()
+    true_val = self.true_val.Simplify()
+    false_val = self.false_val.Simplify()
+    if isinstance(cond, Constant):
+      return false_val if cond.value == '0' else true_val
+
+    if true_val.Equals(false_val):
+      return true_val
+
+    return Select(true_val, cond, false_val)
+
+  def Equals(self, other: Expression) -> bool:
+    if isinstance(other, Select):
+      return self.cond.Equals(other.cond) and self.false_val.Equals(
+          other.false_val) and self.true_val.Equals(other.true_val)
+    return False
+
+
+class Function(Expression):
+  """A function in an expression like min, max, d_ratio."""
+
+  def __init__(self,
+               fn: str,
+               lhs: Union[int, float, Expression],
+               rhs: Optional[Union[int, float, Expression]] = None):
+    self.fn = fn
+    self.lhs = _Constify(lhs)
+    self.rhs = _Constify(rhs)
+
+  def ToPerfJson(self):
+    if self.rhs:
+      return f'{self.fn}({self.lhs.ToPerfJson()}, {self.rhs.ToPerfJson()})'
+    return f'{self.fn}({self.lhs.ToPerfJson()})'
+
+  def ToPython(self):
+    if self.rhs:
+      return f'{self.fn}({self.lhs.ToPython()}, {self.rhs.ToPython()})'
+    return f'{self.fn}({self.lhs.ToPython()})'
+
+  def Simplify(self) -> Expression:
+    lhs = self.lhs.Simplify()
+    rhs = self.rhs.Simplify() if self.rhs else None
+    if isinstance(lhs, Constant) and isinstance(rhs, Constant):
+      if self.fn == 'd_ratio':
+        if rhs.value == '0':
+          return Constant(0)
+        Constant(ast.literal_eval(f'{lhs} / {rhs}'))
+      return Constant(ast.literal_eval(f'{self.fn}({lhs}, {rhs})'))
+
+    return Function(self.fn, lhs, rhs)
+
+  def Equals(self, other: Expression) -> bool:
+    if isinstance(other, Function):
+      return self.fn == other.fn and self.lhs.Equals(
+          other.lhs) and self.rhs.Equals(other.rhs)
+    return False
+
+
+def _FixEscapes(s: str) -> str:
+  s = re.sub(r'([^\\]),', r'\1\\,', s)
+  return re.sub(r'([^\\])=', r'\1\\=', s)
+
+
+class Event(Expression):
+  """An event in an expression."""
+
+  def __init__(self, name: str, legacy_name: str = ''):
+    self.name = _FixEscapes(name)
+    self.legacy_name = _FixEscapes(legacy_name)
+
+  def ToPerfJson(self):
+    result = re.sub('/', '@', self.name)
+    return result
+
+  def ToPython(self):
+    return f'Event(r"{self.name}")'
+
+  def Simplify(self) -> Expression:
+    return self
+
+  def Equals(self, other: Expression) -> bool:
+    return isinstance(other, Event) and self.name == other.name
+
+
+class Constant(Expression):
+  """A constant within the expression tree."""
+
+  def __init__(self, value: Union[float, str]):
+    ctx = decimal.Context()
+    ctx.prec = 20
+    dec = ctx.create_decimal(repr(value) if isinstance(value, float) else value)
+    self.value = dec.normalize().to_eng_string()
+    self.value = self.value.replace('+', '')
+    self.value = self.value.replace('E', 'e')
+
+  def ToPerfJson(self):
+    return self.value
+
+  def ToPython(self):
+    return f'Constant({self.value})'
+
+  def Simplify(self) -> Expression:
+    return self
+
+  def Equals(self, other: Expression) -> bool:
+    return isinstance(other, Constant) and self.value == other.value
+
+
+class Literal(Expression):
+  """A runtime literal within the expression tree."""
+
+  def __init__(self, value: str):
+    self.value = value
+
+  def ToPerfJson(self):
+    return self.value
+
+  def ToPython(self):
+    return f'Literal({self.value})'
+
+  def Simplify(self) -> Expression:
+    return self
+
+  def Equals(self, other: Expression) -> bool:
+    return isinstance(other, Literal) and self.value == other.value
+
+
+def min(lhs: Union[int, float, Expression], rhs: Union[int, float,
+                                                       Expression]) -> Function:
+  # pylint: disable=redefined-builtin
+  # pylint: disable=invalid-name
+  return Function('min', lhs, rhs)
+
+
+def max(lhs: Union[int, float, Expression], rhs: Union[int, float,
+                                                       Expression]) -> Function:
+  # pylint: disable=redefined-builtin
+  # pylint: disable=invalid-name
+  return Function('max', lhs, rhs)
+
+
+def d_ratio(lhs: Union[int, float, Expression],
+            rhs: Union[int, float, Expression]) -> Function:
+  # pylint: disable=redefined-builtin
+  # pylint: disable=invalid-name
+  return Function('d_ratio', lhs, rhs)
+
+
+def source_count(event: Event) -> Function:
+  # pylint: disable=redefined-builtin
+  # pylint: disable=invalid-name
+  return Function('source_count', event)
+
+
+class Metric:
+  """An individual metric that will specifiable on the perf command line."""
+  groups: Set[str]
+  expr: Expression
+  scale_unit: str
+  constraint: bool
+
+  def __init__(self,
+               name: str,
+               description: str,
+               expr: Expression,
+               scale_unit: str,
+               constraint: bool = False):
+    self.name = name
+    self.description = description
+    self.expr = expr.Simplify()
+    # Workraound valid_only_metric hiding certain metrics based on unit.
+    scale_unit = scale_unit.replace('/sec', ' per sec')
+    if scale_unit[0].isdigit():
+      self.scale_unit = scale_unit
+    else:
+      self.scale_unit = f'1{scale_unit}'
+    self.constraint = constraint
+    self.groups = set()
+
+  def __lt__(self, other):
+    """Sort order."""
+    return self.name < other.name
+
+  def AddToMetricGroup(self, group):
+    """Callback used when being added to a MetricGroup."""
+    self.groups.add(group.name)
+
+  def Flatten(self) -> Set['Metric']:
+    """Return a leaf metric."""
+    return set([self])
+
+  def ToPerfJson(self) -> Dict[str, str]:
+    """Return as dictionary for Json generation."""
+    result = {
+        'MetricName': self.name,
+        'MetricGroup': ';'.join(sorted(self.groups)),
+        'BriefDescription': self.description,
+        'MetricExpr': self.expr.ToPerfJson(),
+        'ScaleUnit': self.scale_unit
+    }
+    if self.constraint:
+      result['MetricConstraint'] = 'NO_NMI_WATCHDOG'
+
+    return result
+
+
+class _MetricJsonEncoder(json.JSONEncoder):
+  """Special handling for Metric objects."""
+
+  def default(self, o):
+    if isinstance(o, Metric):
+      return o.ToPerfJson()
+    return json.JSONEncoder.default(self, o)
+
+
+class MetricGroup:
+  """A group of metrics.
+
+  Metric groups may be specificd on the perf command line, but within
+  the json they aren't encoded. Metrics may be in multiple groups
+  which can facilitate arrangements similar to trees.
+  """
+
+  def __init__(self, name: str, metric_list: List[Union[Metric,
+                                                        'MetricGroup']]):
+    self.name = name
+    self.metric_list = metric_list
+    for metric in metric_list:
+      metric.AddToMetricGroup(self)
+
+  def AddToMetricGroup(self, group):
+    """Callback used when a MetricGroup is added into another."""
+    for metric in self.metric_list:
+      metric.AddToMetricGroup(group)
+
+  def Flatten(self) -> Set[Metric]:
+    """Returns a set of all leaf metrics."""
+    result = set()
+    for x in self.metric_list:
+      result = result.union(x.Flatten())
+
+    return result
+
+  def ToPerfJson(self) -> str:
+    return json.dumps(sorted(self.Flatten()), indent=2, cls=_MetricJsonEncoder)
+
+  def __str__(self) -> str:
+    return self.ToPerfJson()
+
+
+class _RewriteIfExpToSelect(ast.NodeTransformer):
+
+  def visit_IfExp(self, node):
+    # pylint: disable=invalid-name
+    self.generic_visit(node)
+    call = ast.Call(
+        func=ast.Name(id='Select', ctx=ast.Load()),
+        args=[node.body, node.test, node.orelse],
+        keywords=[])
+    ast.copy_location(call, node.test)
+    return call
+
+
+def ParsePerfJson(orig: str) -> Expression:
+  """A simple json metric expression decoder.
+
+  Converts a json encoded metric expression by way of python's ast and
+  eval routine. First tokens are mapped to Event calls, then
+  accidentally converted keywords or literals are mapped to their
+  appropriate calls. Python's ast is used to match if-else that can't
+  be handled via operator overloading. Finally the ast is evaluated.
+
+  Args:
+    orig (str): String to parse.
+
+  Returns:
+    Expression: The parsed string.
+  """
+  # pylint: disable=eval-used
+  py = orig.strip()
+  py = re.sub(r'([a-zA-Z][^-+/\* \\\(\),]*(?:\\.[^-+/\* \\\(\),]*)*)',
+              r'Event(r"\1")', py)
+  py = re.sub(r'#Event\(r"([^"]*)"\)', r'Literal("#\1")', py)
+  py = re.sub(r'([0-9]+)Event\(r"(e[0-9]+)"\)', r'\1\2', py)
+  keywords = ['if', 'else', 'min', 'max', 'd_ratio', 'source_count']
+  for kw in keywords:
+    py = re.sub(rf'Event\(r"{kw}"\)', kw, py)
+
+  parsed = ast.parse(py, mode='eval')
+  _RewriteIfExpToSelect().visit(parsed)
+  parsed = ast.fix_missing_locations(parsed)
+  return _Constify(eval(compile(parsed, orig, 'eval')))
diff --git a/tools/perf/pmu-events/metric_test.py b/tools/perf/pmu-events/metric_test.py
new file mode 100644
index 000000000000..15315d0f716c
--- /dev/null
+++ b/tools/perf/pmu-events/metric_test.py
@@ -0,0 +1,157 @@
+# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+import unittest
+from metric import Constant
+from metric import Event
+from metric import ParsePerfJson
+
+
+class TestMetricExpressions(unittest.TestCase):
+
+  def test_Operators(self):
+    a = Event('a')
+    b = Event('b')
+    self.assertEqual((a | b).ToPerfJson(), 'a | b')
+    self.assertEqual((a ^ b).ToPerfJson(), 'a ^ b')
+    self.assertEqual((a & b).ToPerfJson(), 'a & b')
+    self.assertEqual((a < b).ToPerfJson(), 'a < b')
+    self.assertEqual((a > b).ToPerfJson(), 'a > b')
+    self.assertEqual((a + b).ToPerfJson(), 'a + b')
+    self.assertEqual((a - b).ToPerfJson(), 'a - b')
+    self.assertEqual((a * b).ToPerfJson(), 'a * b')
+    self.assertEqual((a / b).ToPerfJson(), 'a / b')
+    self.assertEqual((a % b).ToPerfJson(), 'a % b')
+    one = Constant(1)
+    self.assertEqual((a + one).ToPerfJson(), 'a + 1')
+
+  def test_Brackets(self):
+    a = Event('a')
+    b = Event('b')
+    c = Event('c')
+    self.assertEqual((a * b + c).ToPerfJson(), 'a * b + c')
+    self.assertEqual((a + b * c).ToPerfJson(), 'a + b * c')
+    self.assertEqual(((a + a) + a).ToPerfJson(), 'a + a + a')
+    self.assertEqual(((a + b) * c).ToPerfJson(), '(a + b) * c')
+    self.assertEqual((a + (b * c)).ToPerfJson(), 'a + b * c')
+    self.assertEqual(((a / b) * c).ToPerfJson(), 'a / b * c')
+    self.assertEqual((a / (b * c)).ToPerfJson(), 'a / (b * c)')
+
+  def test_ParsePerfJson(self):
+    # Based on an example of a real metric.
+    before = '(a + b + c + d) / (2 * e)'
+    after = before
+    self.assertEqual(ParsePerfJson(before).ToPerfJson(), after)
+
+    # Parsing should handle events with '-' in their name. Note, in
+    # the json file the '\' are doubled to '\\'.
+    before = r'topdown\-fe\-bound / topdown\-slots - 1'
+    after = before
+    self.assertEqual(ParsePerfJson(before).ToPerfJson(), after)
+
+    # Parsing should handle escaped modifiers. Note, in the json file
+    # the '\' are doubled to '\\'.
+    before = r'arb@event\=0x81\,umask\=0x1@ + arb@event\=0x84\,umask\=0x1@'
+    after = before
+    self.assertEqual(ParsePerfJson(before).ToPerfJson(), after)
+
+    # Parsing should handle exponents in numbers.
+    before = r'a + 1e12 + b'
+    after = before
+    self.assertEqual(ParsePerfJson(before).ToPerfJson(), after)
+
+  def test_IfElseTests(self):
+    # if-else needs rewriting to Select and back.
+    before = r'Event1 if #smt_on else Event2'
+    after = f'({before})'
+    self.assertEqual(ParsePerfJson(before).ToPerfJson(), after)
+
+    before = r'Event1 if 0 else Event2'
+    after = f'({before})'
+    self.assertEqual(ParsePerfJson(before).ToPerfJson(), after)
+
+    before = r'Event1 if 1 else Event2'
+    after = f'({before})'
+    self.assertEqual(ParsePerfJson(before).ToPerfJson(), after)
+
+    # Ensure the select is evaluate last.
+    before = r'Event1 + 1 if Event2 < 2 else Event3 + 3'
+    after = (r'Select(Event(r"Event1") + Constant(1), Event(r"Event2") < '
+             r'Constant(2), Event(r"Event3") + Constant(3))')
+    self.assertEqual(ParsePerfJson(before).ToPython(), after)
+
+    before = r'Event1 > 1 if Event2 < 2 else Event3 > 3'
+    after = (r'Select(Event(r"Event1") > Constant(1), Event(r"Event2") < '
+             r'Constant(2), Event(r"Event3") > Constant(3))')
+    self.assertEqual(ParsePerfJson(before).ToPython(), after)
+
+    before = r'min(a + b if c > 1 else c + d, e + f)'
+    after = r'min((a + b if c > 1 else c + d), e + f)'
+    self.assertEqual(ParsePerfJson(before).ToPerfJson(), after)
+
+    before =3D r'a if b else c if d else e'
+    after =3D r'(a if b else (c if d else e))'
+    self.assertEqual(ParsePerfJson(before).ToPerfJson(), after)
+
+  def test_ToPython(self):
+    # pylint: disable=eval-used
+    # Based on an example of a real metric.
+    before = '(a + b + c + d) / (2 * e)'
+    py = ParsePerfJson(before).ToPython()
+    after = eval(py).ToPerfJson()
+    self.assertEqual(before, after)
+
+  def test_Simplify(self):
+    before = '1 + 2 + 3'
+    after = '6'
+    self.assertEqual(ParsePerfJson(before).Simplify().ToPerfJson(), after)
+
+    before = 'a + 0'
+    after = 'a'
+    self.assertEqual(ParsePerfJson(before).Simplify().ToPerfJson(), after)
+
+    before = '0 + a'
+    after = 'a'
+    self.assertEqual(ParsePerfJson(before).Simplify().ToPerfJson(), after)
+
+    before = 'a | 0'
+    after = 'a'
+    self.assertEqual(ParsePerfJson(before).Simplify().ToPerfJson(), after)
+
+    before = '0 | a'
+    after = 'a'
+    self.assertEqual(ParsePerfJson(before).Simplify().ToPerfJson(), after)
+
+    before = 'a * 0'
+    after = '0'
+    self.assertEqual(ParsePerfJson(before).Simplify().ToPerfJson(), after)
+
+    before = '0 * a'
+    after = '0'
+    self.assertEqual(ParsePerfJson(before).Simplify().ToPerfJson(), after)
+
+    before = 'a * 1'
+    after = 'a'
+    self.assertEqual(ParsePerfJson(before).Simplify().ToPerfJson(), after)
+
+    before = '1 * a'
+    after = 'a'
+    self.assertEqual(ParsePerfJson(before).Simplify().ToPerfJson(), after)
+
+    before = 'a if 0 else b'
+    after = 'b'
+    self.assertEqual(ParsePerfJson(before).Simplify().ToPerfJson(), after)
+
+    before = 'a if 1 else b'
+    after = 'a'
+    self.assertEqual(ParsePerfJson(before).Simplify().ToPerfJson(), after)
+
+    before = 'a if b else a'
+    after = 'a'
+    self.assertEqual(ParsePerfJson(before).Simplify().ToPerfJson(), after)
+
+    # Pattern used to add a slots event to metrics that require it.
+    before = '0 * SLOTS'
+    after = '0 * SLOTS'
+    self.assertEqual(ParsePerfJson(before).Simplify().ToPerfJson(), after)
+
+if __name__ == '__main__':
+  unittest.main()
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/Build b/tools/perf/scripts/python/Perf-Trace-Util/Build
index 7d0e33ce6aba..d5fed4e42617 100644
--- a/tools/perf/scripts/python/Perf-Trace-Util/Build
+++ b/tools/perf/scripts/python/Perf-Trace-Util/Build
@@ -1,3 +1,3 @@
-perf-y += Context.o
+perf-$(CONFIG_LIBTRACEEVENT) += Context.o
 
 CFLAGS_Context.o += $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs
diff --git a/tools/perf/scripts/python/bin/task-analyzer-record b/tools/perf/scripts/python/bin/task-analyzer-record
new file mode 100755
index 000000000000..0f6b51bb2767
--- /dev/null
+++ b/tools/perf/scripts/python/bin/task-analyzer-record
@@ -0,0 +1,2 @@
+#!/bin/bash
+perf record -e sched:sched_switch -e sched:sched_migrate_task "$@"
diff --git a/tools/perf/scripts/python/bin/task-analyzer-report b/tools/perf/scripts/python/bin/task-analyzer-report
new file mode 100755
index 000000000000..4b16a8cc40a0
--- /dev/null
+++ b/tools/perf/scripts/python/bin/task-analyzer-report
@@ -0,0 +1,3 @@
+#!/bin/bash
+# description: analyze timings of tasks
+perf script -s "$PERF_EXEC_PATH"/scripts/python/task-analyzer.py -- "$@"
diff --git a/tools/perf/scripts/python/intel-pt-events.py b/tools/perf/scripts/python/intel-pt-events.py
index 6be7fd8fd615..08862a2582f4 100644
--- a/tools/perf/scripts/python/intel-pt-events.py
+++ b/tools/perf/scripts/python/intel-pt-events.py
@@ -13,10 +13,12 @@
 
 from __future__ import print_function
 
+import io
 import os
 import sys
 import struct
 import argparse
+import contextlib
 
 from libxed import LibXED
 from ctypes import create_string_buffer, addressof
@@ -39,6 +41,11 @@ glb_src			= False
 glb_source_file_name	= None
 glb_line_number		= None
 glb_dso			= None
+glb_stash_dict		= {}
+glb_output		= None
+glb_output_pos		= 0
+glb_cpu			= -1
+glb_time		= 0
 
 def get_optional_null(perf_dict, field):
 	if field in perf_dict:
@@ -70,6 +77,7 @@ def trace_begin():
 	ap.add_argument("--insn-trace", action='store_true')
 	ap.add_argument("--src-trace", action='store_true')
 	ap.add_argument("--all-switch-events", action='store_true')
+	ap.add_argument("--interleave", type=int, nargs='?', const=4, default=0)
 	global glb_args
 	global glb_insn
 	global glb_src
@@ -94,11 +102,39 @@ def trace_begin():
 	perf_set_itrace_options(perf_script_context, itrace)
 
 def trace_end():
+	if glb_args.interleave:
+		flush_stashed_output()
 	print("End")
 
 def trace_unhandled(event_name, context, event_fields_dict):
 		print(' '.join(['%s=%s'%(k,str(v))for k,v in sorted(event_fields_dict.items())]))
 
+def stash_output():
+	global glb_stash_dict
+	global glb_output_pos
+	output_str = glb_output.getvalue()[glb_output_pos:]
+	n = len(output_str)
+	if n:
+		glb_output_pos += n
+		if glb_cpu not in glb_stash_dict:
+			glb_stash_dict[glb_cpu] = []
+		glb_stash_dict[glb_cpu].append(output_str)
+
+def flush_stashed_output():
+	global glb_stash_dict
+	while glb_stash_dict:
+		cpus = list(glb_stash_dict.keys())
+		# Output at most glb_args.interleave output strings per cpu
+		for cpu in cpus:
+			items = glb_stash_dict[cpu]
+			countdown = glb_args.interleave
+			while len(items) and countdown:
+				sys.stdout.write(items[0])
+				del items[0]
+				countdown -= 1
+			if not items:
+				del glb_stash_dict[cpu]
+
 def print_ptwrite(raw_buf):
 	data = struct.unpack_from("<IQ", raw_buf)
 	flags = data[0]
@@ -375,15 +411,40 @@ def do_process_event(param_dict):
 		print_common_start(comm, sample, name)
 		print_common_ip(param_dict, sample, symbol, dso)
 
+def interleave_events(param_dict):
+	global glb_cpu
+	global glb_time
+	global glb_output
+	global glb_output_pos
+
+	sample  = param_dict["sample"]
+	glb_cpu = sample["cpu"]
+	ts      = sample["time"]
+
+	if glb_time != ts:
+		glb_time = ts
+		flush_stashed_output()
+
+	glb_output_pos = 0
+	with contextlib.redirect_stdout(io.StringIO()) as glb_output:
+		do_process_event(param_dict)
+
+	stash_output()
+
 def process_event(param_dict):
 	try:
-		do_process_event(param_dict)
+		if glb_args.interleave:
+			interleave_events(param_dict)
+		else:
+			do_process_event(param_dict)
 	except broken_pipe_exception:
 		# Stop python printing broken pipe errors and traceback
 		sys.stdout = open(os.devnull, 'w')
 		sys.exit(1)
 
 def auxtrace_error(typ, code, cpu, pid, tid, ip, ts, msg, cpumode, *x):
+	if glb_args.interleave:
+		flush_stashed_output()
 	if len(x) >= 2 and x[0]:
 		machine_pid = x[0]
 		vcpu = x[1]
@@ -403,6 +464,8 @@ def auxtrace_error(typ, code, cpu, pid, tid, ip, ts, msg, cpumode, *x):
 		sys.exit(1)
 
 def context_switch(ts, cpu, pid, tid, np_pid, np_tid, machine_pid, out, out_preempt, *x):
+	if glb_args.interleave:
+		flush_stashed_output()
 	if out:
 		out_str = "Switch out "
 	else:
diff --git a/tools/perf/scripts/python/task-analyzer.py b/tools/perf/scripts/python/task-analyzer.py
new file mode 100755
index 000000000000..52e8dae9b1f0
--- /dev/null
+++ b/tools/perf/scripts/python/task-analyzer.py
@@ -0,0 +1,934 @@
+# task-analyzer.py - comprehensive perf tasks analysis
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2022, Hagen Paul Pfeifer <hagen@jauu.net>
+# Licensed under the terms of the GNU GPL License version 2
+#
+# Usage:
+#
+#     perf record -e sched:sched_switch -a -- sleep 10
+#     perf script report task-analyzer
+#
+
+from __future__ import print_function
+import sys
+import os
+import string
+import argparse
+import decimal
+
+
+sys.path.append(
+    os.environ["PERF_EXEC_PATH"] + "/scripts/python/Perf-Trace-Util/lib/Perf/Trace"
+)
+from perf_trace_context import *
+from Core import *
+
+# Definition of possible ASCII color codes
+_COLORS = {
+    "grey": "\033[90m",
+    "red": "\033[91m",
+    "green": "\033[92m",
+    "yellow": "\033[93m",
+    "blue": "\033[94m",
+    "violet": "\033[95m",
+    "reset": "\033[0m",
+}
+
+# Columns will have a static size to align everything properly
+# Support of 116 days of active update with nano precision
+LEN_SWITCHED_IN = len("9999999.999999999")  # 17
+LEN_SWITCHED_OUT = len("9999999.999999999")  # 17
+LEN_CPU = len("000")
+LEN_PID = len("maxvalue")  # 8
+LEN_TID = len("maxvalue")  # 8
+LEN_COMM = len("max-comms-length")  # 16
+LEN_RUNTIME = len("999999.999")  # 10
+# Support of 3.45 hours of timespans
+LEN_OUT_IN = len("99999999999.999")  # 15
+LEN_OUT_OUT = len("99999999999.999")  # 15
+LEN_IN_IN = len("99999999999.999")  # 15
+LEN_IN_OUT = len("99999999999.999")  # 15
+
+
+# py2/py3 compatibility layer, see PEP469
+try:
+    dict.iteritems
+except AttributeError:
+    # py3
+    def itervalues(d):
+        return iter(d.values())
+
+    def iteritems(d):
+        return iter(d.items())
+
+else:
+    # py2
+    def itervalues(d):
+        return d.itervalues()
+
+    def iteritems(d):
+        return d.iteritems()
+
+
+def _check_color():
+    global _COLORS
+    """user enforced no-color or if stdout is no tty we disable colors"""
+    if sys.stdout.isatty() and args.stdio_color != "never":
+        return
+    _COLORS = {
+        "grey": "",
+        "red": "",
+        "green": "",
+        "yellow": "",
+        "blue": "",
+        "violet": "",
+        "reset": "",
+    }
+
+
+def _parse_args():
+    global args
+    parser = argparse.ArgumentParser(description="Analyze tasks behavior")
+    parser.add_argument(
+        "--time-limit",
+        default=[],
+        help=
+            "print tasks only in time[s] window e.g"
+        " --time-limit 123.111:789.222(print all between 123.111 and 789.222)"
+        " --time-limit 123: (print all from 123)"
+        " --time-limit :456 (print all until incl. 456)",
+    )
+    parser.add_argument(
+        "--summary", action="store_true", help="print addtional runtime information"
+    )
+    parser.add_argument(
+        "--summary-only", action="store_true", help="print only summary without traces"
+    )
+    parser.add_argument(
+        "--summary-extended",
+        action="store_true",
+        help="print the summary with additional information of max inter task times"
+            " relative to the prev task",
+    )
+    parser.add_argument(
+        "--ns", action="store_true", help="show timestamps in nanoseconds"
+    )
+    parser.add_argument(
+        "--ms", action="store_true", help="show timestamps in miliseconds"
+    )
+    parser.add_argument(
+        "--extended-times",
+        action="store_true",
+        help="Show the elapsed times between schedule in/schedule out"
+            " of this task and the schedule in/schedule out of previous occurrence"
+            " of the same task",
+    )
+    parser.add_argument(
+        "--filter-tasks",
+        default=[],
+        help="filter out unneeded tasks by tid, pid or processname."
+        " E.g --filter-task 1337,/sbin/init ",
+    )
+    parser.add_argument(
+        "--limit-to-tasks",
+        default=[],
+        help="limit output to selected task by tid, pid, processname."
+        " E.g --limit-to-tasks 1337,/sbin/init",
+    )
+    parser.add_argument(
+        "--highlight-tasks",
+        default="",
+        help="colorize special tasks by their pid/tid/comm."
+        " E.g. --highlight-tasks 1:red,mutt:yellow"
+        " Colors available: red,grey,yellow,blue,violet,green",
+    )
+    parser.add_argument(
+        "--rename-comms-by-tids",
+        default="",
+        help="rename task names by using tid (<tid>:<newname>,<tid>:<newname>)"
+            " This option is handy for inexpressive processnames like python interpreted"
+            " process. E.g --rename 1337:my-python-app",
+    )
+    parser.add_argument(
+        "--stdio-color",
+        default="auto",
+        choices=["always", "never", "auto"],
+        help="always, never or auto, allowing configuring color output"
+            " via the command line",
+    )
+    parser.add_argument(
+        "--csv",
+        default="",
+        help="Write trace to file selected by user. Options, like --ns or --extended"
+            "-times are used.",
+    )
+    parser.add_argument(
+        "--csv-summary",
+        default="",
+        help="Write summary to file selected by user. Options, like --ns or"
+            " --summary-extended are used.",
+    )
+    args = parser.parse_args()
+    args.tid_renames = dict()
+
+    _argument_filter_sanity_check()
+    _argument_prepare_check()
+
+
+def time_uniter(unit):
+    picker = {
+        "s": 1,
+        "ms": 1e3,
+        "us": 1e6,
+        "ns": 1e9,
+    }
+    return picker[unit]
+
+
+def _init_db():
+    global db
+    db = dict()
+    db["running"] = dict()
+    db["cpu"] = dict()
+    db["tid"] = dict()
+    db["global"] = []
+    if args.summary or args.summary_extended or args.summary_only:
+        db["task_info"] = dict()
+        db["runtime_info"] = dict()
+        # min values for summary depending on the header
+        db["task_info"]["pid"] = len("PID")
+        db["task_info"]["tid"] = len("TID")
+        db["task_info"]["comm"] = len("Comm")
+        db["runtime_info"]["runs"] = len("Runs")
+        db["runtime_info"]["acc"] = len("Accumulated")
+        db["runtime_info"]["max"] = len("Max")
+        db["runtime_info"]["max_at"] = len("Max At")
+        db["runtime_info"]["min"] = len("Min")
+        db["runtime_info"]["mean"] = len("Mean")
+        db["runtime_info"]["median"] = len("Median")
+        if args.summary_extended:
+            db["inter_times"] = dict()
+            db["inter_times"]["out_in"] = len("Out-In")
+            db["inter_times"]["inter_at"] = len("At")
+            db["inter_times"]["out_out"] = len("Out-Out")
+            db["inter_times"]["in_in"] = len("In-In")
+            db["inter_times"]["in_out"] = len("In-Out")
+
+
+def _median(numbers):
+    """phython3 hat statistics module - we have nothing"""
+    n = len(numbers)
+    index = n // 2
+    if n % 2:
+        return sorted(numbers)[index]
+    return sum(sorted(numbers)[index - 1 : index + 1]) / 2
+
+
+def _mean(numbers):
+    return sum(numbers) / len(numbers)
+
+
+class Timespans(object):
+    """
+    The elapsed time between two occurrences of the same task is being tracked with the
+    help of this class. There are 4 of those Timespans Out-Out, In-Out, Out-In and
+    In-In.
+    The first half of the name signals the first time point of the
+    first task. The second half of the name represents the second
+    timepoint of the second task.
+    """
+
+    def __init__(self):
+        self._last_start = None
+        self._last_finish = None
+        self.out_out = -1
+        self.in_out = -1
+        self.out_in = -1
+        self.in_in = -1
+        if args.summary_extended:
+            self._time_in = -1
+            self.max_out_in = -1
+            self.max_at = -1
+            self.max_in_out = -1
+            self.max_in_in = -1
+            self.max_out_out = -1
+
+    def feed(self, task):
+        """
+        Called for every recorded trace event to find process pair and calculate the
+        task timespans. Chronological ordering, feed does not do reordering
+        """
+        if not self._last_finish:
+            self._last_start = task.time_in(time_unit)
+            self._last_finish = task.time_out(time_unit)
+            return
+        self._time_in = task.time_in()
+        time_in = task.time_in(time_unit)
+        time_out = task.time_out(time_unit)
+        self.in_in = time_in - self._last_start
+        self.out_in = time_in - self._last_finish
+        self.in_out = time_out - self._last_start
+        self.out_out = time_out - self._last_finish
+        if args.summary_extended:
+            self._update_max_entries()
+        self._last_finish = task.time_out(time_unit)
+        self._last_start = task.time_in(time_unit)
+
+    def _update_max_entries(self):
+        if self.in_in > self.max_in_in:
+            self.max_in_in = self.in_in
+        if self.out_out > self.max_out_out:
+            self.max_out_out = self.out_out
+        if self.in_out > self.max_in_out:
+            self.max_in_out = self.in_out
+        if self.out_in > self.max_out_in:
+            self.max_out_in = self.out_in
+            self.max_at = self._time_in
+
+
+
+class Summary(object):
+    """
+    Primary instance for calculating the summary output. Processes the whole trace to
+    find and memorize relevant data such as mean, max et cetera. This instance handles
+    dynamic alignment aspects for summary output.
+    """
+
+    def __init__(self):
+        self._body = []
+
+    class AlignmentHelper:
+        """
+        Used to calculated the alignment for the output of the summary.
+        """
+        def __init__(self, pid, tid, comm, runs, acc, mean,
+                    median, min, max, max_at):
+            self.pid = pid
+            self.tid = tid
+            self.comm = comm
+            self.runs = runs
+            self.acc = acc
+            self.mean = mean
+            self.median = median
+            self.min = min
+            self.max = max
+            self.max_at = max_at
+            if args.summary_extended:
+                self.out_in = None
+                self.inter_at = None
+                self.out_out = None
+                self.in_in = None
+                self.in_out = None
+
+    def _print_header(self):
+        '''
+        Output is trimmed in _format_stats thus additional adjustment in the header
+        is needed, depending on the choice of timeunit. The adjustment corresponds
+        to the amount of column titles being adjusted in _column_titles.
+        '''
+        decimal_precision = 6 if not args.ns else 9
+        fmt = " {{:^{}}}".format(sum(db["task_info"].values()))
+        fmt += " {{:^{}}}".format(
+            sum(db["runtime_info"].values()) - 2 * decimal_precision
+            )
+        _header = ("Task Information", "Runtime Information")
+
+        if args.summary_extended:
+            fmt += " {{:^{}}}".format(
+                sum(db["inter_times"].values()) - 4 * decimal_precision
+                )
+            _header += ("Max Inter Task Times",)
+        fd_sum.write(fmt.format(*_header) +  "\n")
+
+    def _column_titles(self):
+        """
+        Cells are being processed and displayed in different way so an alignment adjust
+        is implemented depeding on the choice of the timeunit. The positions of the max
+        values are being displayed in grey. Thus in their format two additional {},
+        are placed for color set and reset.
+        """
+        separator, fix_csv_align = _prepare_fmt_sep()
+        decimal_precision, time_precision = _prepare_fmt_precision()
+        fmt = "{{:>{}}}".format(db["task_info"]["pid"] * fix_csv_align)
+        fmt += "{}{{:>{}}}".format(separator, db["task_info"]["tid"] * fix_csv_align)
+        fmt += "{}{{:>{}}}".format(separator, db["task_info"]["comm"] * fix_csv_align)
+        fmt += "{}{{:>{}}}".format(separator, db["runtime_info"]["runs"] * fix_csv_align)
+        fmt += "{}{{:>{}}}".format(separator, db["runtime_info"]["acc"] * fix_csv_align)
+        fmt += "{}{{:>{}}}".format(separator, db["runtime_info"]["mean"] * fix_csv_align)
+        fmt += "{}{{:>{}}}".format(
+            separator, db["runtime_info"]["median"] * fix_csv_align
+        )
+        fmt += "{}{{:>{}}}".format(
+            separator, (db["runtime_info"]["min"] - decimal_precision) * fix_csv_align
+        )
+        fmt += "{}{{:>{}}}".format(
+            separator, (db["runtime_info"]["max"] - decimal_precision) * fix_csv_align
+        )
+        fmt += "{}{{}}{{:>{}}}{{}}".format(
+            separator, (db["runtime_info"]["max_at"] - time_precision) * fix_csv_align
+        )
+
+        column_titles = ("PID", "TID", "Comm")
+        column_titles += ("Runs", "Accumulated", "Mean", "Median", "Min", "Max")
+        column_titles += (_COLORS["grey"], "Max At", _COLORS["reset"])
+
+        if args.summary_extended:
+            fmt += "{}{{:>{}}}".format(
+                separator,
+                (db["inter_times"]["out_in"] - decimal_precision) * fix_csv_align
+            )
+            fmt += "{}{{}}{{:>{}}}{{}}".format(
+                separator,
+                (db["inter_times"]["inter_at"] - time_precision) * fix_csv_align
+            )
+            fmt += "{}{{:>{}}}".format(
+                separator,
+                (db["inter_times"]["out_out"] - decimal_precision) * fix_csv_align
+            )
+            fmt += "{}{{:>{}}}".format(
+                separator,
+                (db["inter_times"]["in_in"] - decimal_precision) * fix_csv_align
+            )
+            fmt += "{}{{:>{}}}".format(
+                separator,
+                (db["inter_times"]["in_out"] - decimal_precision) * fix_csv_align
+            )
+
+            column_titles += ("Out-In", _COLORS["grey"], "Max At", _COLORS["reset"],
+                        "Out-Out", "In-In", "In-Out")
+
+        fd_sum.write(fmt.format(*column_titles) + "\n")
+
+
+    def _task_stats(self):
+        """calculates the stats of every task and constructs the printable summary"""
+        for tid in sorted(db["tid"]):
+            color_one_sample = _COLORS["grey"]
+            color_reset = _COLORS["reset"]
+            no_executed = 0
+            runtimes = []
+            time_in = []
+            timespans = Timespans()
+            for task in db["tid"][tid]:
+                pid = task.pid
+                comm = task.comm
+                no_executed += 1
+                runtimes.append(task.runtime(time_unit))
+                time_in.append(task.time_in())
+                timespans.feed(task)
+            if len(runtimes) > 1:
+                color_one_sample = ""
+                color_reset = ""
+            time_max = max(runtimes)
+            time_min = min(runtimes)
+            max_at = time_in[runtimes.index(max(runtimes))]
+
+            # The size of the decimal after sum,mean and median varies, thus we cut
+            # the decimal number, by rounding it. It has no impact on the output,
+            # because we have a precision of the decimal points at the output.
+            time_sum = round(sum(runtimes), 3)
+            time_mean = round(_mean(runtimes), 3)
+            time_median = round(_median(runtimes), 3)
+
+            align_helper = self.AlignmentHelper(pid, tid, comm, no_executed, time_sum,
+                                    time_mean, time_median, time_min, time_max, max_at)
+            self._body.append([pid, tid, comm, no_executed, time_sum, color_one_sample,
+                                time_mean, time_median, time_min, time_max,
+                                _COLORS["grey"], max_at, _COLORS["reset"], color_reset])
+            if args.summary_extended:
+                self._body[-1].extend([timespans.max_out_in,
+                                _COLORS["grey"], timespans.max_at,
+                                _COLORS["reset"], timespans.max_out_out,
+                                timespans.max_in_in,
+                                timespans.max_in_out])
+                align_helper.out_in = timespans.max_out_in
+                align_helper.inter_at = timespans.max_at
+                align_helper.out_out = timespans.max_out_out
+                align_helper.in_in = timespans.max_in_in
+                align_helper.in_out = timespans.max_in_out
+            self._calc_alignments_summary(align_helper)
+
+    def _format_stats(self):
+        separator, fix_csv_align = _prepare_fmt_sep()
+        decimal_precision, time_precision = _prepare_fmt_precision()
+        len_pid = db["task_info"]["pid"] * fix_csv_align
+        len_tid = db["task_info"]["tid"] * fix_csv_align
+        len_comm = db["task_info"]["comm"] * fix_csv_align
+        len_runs = db["runtime_info"]["runs"] * fix_csv_align
+        len_acc = db["runtime_info"]["acc"] * fix_csv_align
+        len_mean = db["runtime_info"]["mean"] * fix_csv_align
+        len_median = db["runtime_info"]["median"] * fix_csv_align
+        len_min = (db["runtime_info"]["min"] - decimal_precision) * fix_csv_align
+        len_max = (db["runtime_info"]["max"] - decimal_precision) * fix_csv_align
+        len_max_at = (db["runtime_info"]["max_at"] - time_precision) * fix_csv_align
+        if args.summary_extended:
+            len_out_in = (
+                db["inter_times"]["out_in"] - decimal_precision
+            ) * fix_csv_align
+            len_inter_at = (
+                db["inter_times"]["inter_at"] - time_precision
+            ) * fix_csv_align
+            len_out_out = (
+                db["inter_times"]["out_out"] - decimal_precision
+            ) * fix_csv_align
+            len_in_in = (db["inter_times"]["in_in"] - decimal_precision) * fix_csv_align
+            len_in_out = (
+                db["inter_times"]["in_out"] - decimal_precision
+            ) * fix_csv_align
+
+        fmt = "{{:{}d}}".format(len_pid)
+        fmt += "{}{{:{}d}}".format(separator, len_tid)
+        fmt += "{}{{:>{}}}".format(separator, len_comm)
+        fmt += "{}{{:{}d}}".format(separator, len_runs)
+        fmt += "{}{{:{}.{}f}}".format(separator, len_acc, time_precision)
+        fmt += "{}{{}}{{:{}.{}f}}".format(separator, len_mean, time_precision)
+        fmt += "{}{{:{}.{}f}}".format(separator, len_median, time_precision)
+        fmt += "{}{{:{}.{}f}}".format(separator, len_min, time_precision)
+        fmt += "{}{{:{}.{}f}}".format(separator, len_max, time_precision)
+        fmt += "{}{{}}{{:{}.{}f}}{{}}{{}}".format(
+            separator, len_max_at, decimal_precision
+        )
+        if args.summary_extended:
+            fmt += "{}{{:{}.{}f}}".format(separator, len_out_in, time_precision)
+            fmt += "{}{{}}{{:{}.{}f}}{{}}".format(
+                separator, len_inter_at, decimal_precision
+            )
+            fmt += "{}{{:{}.{}f}}".format(separator, len_out_out, time_precision)
+            fmt += "{}{{:{}.{}f}}".format(separator, len_in_in, time_precision)
+            fmt += "{}{{:{}.{}f}}".format(separator, len_in_out, time_precision)
+        return fmt
+
+
+    def _calc_alignments_summary(self, align_helper):
+        # Length is being cut in 3 groups so that further addition is easier to handle.
+        # The length of every argument from the alignment helper is being checked if it
+        # is longer than the longest until now. In that case the length is being saved.
+        for key in db["task_info"]:
+            if len(str(getattr(align_helper, key))) > db["task_info"][key]:
+                db["task_info"][key] = len(str(getattr(align_helper, key)))
+        for key in db["runtime_info"]:
+            if len(str(getattr(align_helper, key))) > db["runtime_info"][key]:
+                db["runtime_info"][key] = len(str(getattr(align_helper, key)))
+        if args.summary_extended:
+            for key in db["inter_times"]:
+                if len(str(getattr(align_helper, key))) > db["inter_times"][key]:
+                    db["inter_times"][key] = len(str(getattr(align_helper, key)))
+
+
+    def print(self):
+        self._task_stats()
+        fmt = self._format_stats()
+
+        if not args.csv_summary:
+            print("\nSummary")
+            self._print_header()
+        self._column_titles()
+        for i in range(len(self._body)):
+            fd_sum.write(fmt.format(*tuple(self._body[i])) + "\n")
+
+
+
+class Task(object):
+    """ The class is used to handle the information of a given task."""
+
+    def __init__(self, id, tid, cpu, comm):
+        self.id = id
+        self.tid = tid
+        self.cpu = cpu
+        self.comm = comm
+        self.pid = None
+        self._time_in = None
+        self._time_out = None
+
+    def schedule_in_at(self, time):
+        """set the time where the task was scheduled in"""
+        self._time_in = time
+
+    def schedule_out_at(self, time):
+        """set the time where the task was scheduled out"""
+        self._time_out = time
+
+    def time_out(self, unit="s"):
+        """return time where a given task was scheduled out"""
+        factor = time_uniter(unit)
+        return self._time_out * decimal.Decimal(factor)
+
+    def time_in(self, unit="s"):
+        """return time where a given task was scheduled in"""
+        factor = time_uniter(unit)
+        return self._time_in * decimal.Decimal(factor)
+
+    def runtime(self, unit="us"):
+        factor = time_uniter(unit)
+        return (self._time_out - self._time_in) * decimal.Decimal(factor)
+
+    def update_pid(self, pid):
+        self.pid = pid
+
+
+def _task_id(pid, cpu):
+    """returns a "unique-enough" identifier, please do not change"""
+    return "{}-{}".format(pid, cpu)
+
+
+def _filter_non_printable(unfiltered):
+    """comm names may contain loony chars like '\x00000'"""
+    filtered = ""
+    for char in unfiltered:
+        if char not in string.printable:
+            continue
+        filtered += char
+    return filtered
+
+
+def _fmt_header():
+    separator, fix_csv_align = _prepare_fmt_sep()
+    fmt = "{{:>{}}}".format(LEN_SWITCHED_IN*fix_csv_align)
+    fmt += "{}{{:>{}}}".format(separator, LEN_SWITCHED_OUT*fix_csv_align)
+    fmt += "{}{{:>{}}}".format(separator, LEN_CPU*fix_csv_align)
+    fmt += "{}{{:>{}}}".format(separator, LEN_PID*fix_csv_align)
+    fmt += "{}{{:>{}}}".format(separator, LEN_TID*fix_csv_align)
+    fmt += "{}{{:>{}}}".format(separator, LEN_COMM*fix_csv_align)
+    fmt += "{}{{:>{}}}".format(separator, LEN_RUNTIME*fix_csv_align)
+    fmt += "{}{{:>{}}}".format(separator, LEN_OUT_IN*fix_csv_align)
+    if args.extended_times:
+        fmt += "{}{{:>{}}}".format(separator, LEN_OUT_OUT*fix_csv_align)
+        fmt += "{}{{:>{}}}".format(separator, LEN_IN_IN*fix_csv_align)
+        fmt += "{}{{:>{}}}".format(separator, LEN_IN_OUT*fix_csv_align)
+    return fmt
+
+
+def _fmt_body():
+    separator, fix_csv_align = _prepare_fmt_sep()
+    decimal_precision, time_precision = _prepare_fmt_precision()
+    fmt = "{{}}{{:{}.{}f}}".format(LEN_SWITCHED_IN*fix_csv_align, decimal_precision)
+    fmt += "{}{{:{}.{}f}}".format(
+        separator, LEN_SWITCHED_OUT*fix_csv_align, decimal_precision
+    )
+    fmt += "{}{{:{}d}}".format(separator, LEN_CPU*fix_csv_align)
+    fmt += "{}{{:{}d}}".format(separator, LEN_PID*fix_csv_align)
+    fmt += "{}{{}}{{:{}d}}{{}}".format(separator, LEN_TID*fix_csv_align)
+    fmt += "{}{{}}{{:>{}}}".format(separator, LEN_COMM*fix_csv_align)
+    fmt += "{}{{:{}.{}f}}".format(separator, LEN_RUNTIME*fix_csv_align, time_precision)
+    if args.extended_times:
+        fmt += "{}{{:{}.{}f}}".format(separator, LEN_OUT_IN*fix_csv_align, time_precision)
+        fmt += "{}{{:{}.{}f}}".format(separator, LEN_OUT_OUT*fix_csv_align, time_precision)
+        fmt += "{}{{:{}.{}f}}".format(separator, LEN_IN_IN*fix_csv_align, time_precision)
+        fmt += "{}{{:{}.{}f}}{{}}".format(
+            separator, LEN_IN_OUT*fix_csv_align, time_precision
+        )
+    else:
+        fmt += "{}{{:{}.{}f}}{{}}".format(
+            separator, LEN_OUT_IN*fix_csv_align, time_precision
+        )
+    return fmt
+
+
+def _print_header():
+    fmt = _fmt_header()
+    header = ("Switched-In", "Switched-Out", "CPU", "PID", "TID", "Comm", "Runtime",
+            "Time Out-In")
+    if args.extended_times:
+        header += ("Time Out-Out", "Time In-In", "Time In-Out")
+    fd_task.write(fmt.format(*header) + "\n")
+
+
+
+def _print_task_finish(task):
+    """calculating every entry of a row and printing it immediately"""
+    c_row_set = ""
+    c_row_reset = ""
+    out_in = -1
+    out_out = -1
+    in_in = -1
+    in_out = -1
+    fmt = _fmt_body()
+    # depending on user provided highlight option we change the color
+    # for particular tasks
+    if str(task.tid) in args.highlight_tasks_map:
+        c_row_set = _COLORS[args.highlight_tasks_map[str(task.tid)]]
+        c_row_reset = _COLORS["reset"]
+    if task.comm in args.highlight_tasks_map:
+        c_row_set = _COLORS[args.highlight_tasks_map[task.comm]]
+        c_row_reset = _COLORS["reset"]
+    # grey-out entries if PID == TID, they
+    # are identical, no threaded model so the
+    # thread id (tid) do not matter
+    c_tid_set = ""
+    c_tid_reset = ""
+    if task.pid == task.tid:
+        c_tid_set = _COLORS["grey"]
+        c_tid_reset = _COLORS["reset"]
+    if task.tid in db["tid"]:
+        # get last task of tid
+        last_tid_task = db["tid"][task.tid][-1]
+        # feed the timespan calculate, last in tid db
+        # and second the current one
+        timespan_gap_tid = Timespans()
+        timespan_gap_tid.feed(last_tid_task)
+        timespan_gap_tid.feed(task)
+        out_in = timespan_gap_tid.out_in
+        out_out = timespan_gap_tid.out_out
+        in_in = timespan_gap_tid.in_in
+        in_out = timespan_gap_tid.in_out
+
+
+    if args.extended_times:
+        line_out = fmt.format(c_row_set, task.time_in(), task.time_out(), task.cpu,
+                        task.pid, c_tid_set, task.tid, c_tid_reset, c_row_set, task.comm,
+                        task.runtime(time_unit), out_in, out_out, in_in, in_out,
+                        c_row_reset) + "\n"
+    else:
+        line_out = fmt.format(c_row_set, task.time_in(), task.time_out(), task.cpu,
+                        task.pid, c_tid_set, task.tid, c_tid_reset, c_row_set, task.comm,
+                        task.runtime(time_unit), out_in, c_row_reset) + "\n"
+    try:
+        fd_task.write(line_out)
+    except(IOError):
+        # don't mangle the output if user SIGINT this script
+        sys.exit()
+
+def _record_cleanup(_list):
+    """
+    no need to store more then one element if --summarize
+    is not enabled
+    """
+    if not args.summary and len(_list) > 1:
+        _list = _list[len(_list) - 1 :]
+
+
+def _record_by_tid(task):
+    tid = task.tid
+    if tid not in db["tid"]:
+        db["tid"][tid] = []
+    db["tid"][tid].append(task)
+    _record_cleanup(db["tid"][tid])
+
+
+def _record_by_cpu(task):
+    cpu = task.cpu
+    if cpu not in db["cpu"]:
+        db["cpu"][cpu] = []
+    db["cpu"][cpu].append(task)
+    _record_cleanup(db["cpu"][cpu])
+
+
+def _record_global(task):
+    """record all executed task, ordered by finish chronological"""
+    db["global"].append(task)
+    _record_cleanup(db["global"])
+
+
+def _handle_task_finish(tid, cpu, time, perf_sample_dict):
+    if tid == 0:
+        return
+    _id = _task_id(tid, cpu)
+    if _id not in db["running"]:
+        # may happen, if we missed the switch to
+        # event. Seen in combination with --exclude-perf
+        # where the start is filtered out, but not the
+        # switched in. Probably a bug in exclude-perf
+        # option.
+        return
+    task = db["running"][_id]
+    task.schedule_out_at(time)
+
+    # record tid, during schedule in the tid
+    # is not available, update now
+    pid = int(perf_sample_dict["sample"]["pid"])
+
+    task.update_pid(pid)
+    del db["running"][_id]
+
+    # print only tasks which are not being filtered and no print of trace
+    # for summary only, but record every task.
+    if not _limit_filtered(tid, pid, task.comm) and not args.summary_only:
+        _print_task_finish(task)
+    _record_by_tid(task)
+    _record_by_cpu(task)
+    _record_global(task)
+
+
+def _handle_task_start(tid, cpu, comm, time):
+    if tid == 0:
+        return
+    if tid in args.tid_renames:
+        comm = args.tid_renames[tid]
+    _id = _task_id(tid, cpu)
+    if _id in db["running"]:
+        # handle corner cases where already running tasks
+        # are switched-to again - saw this via --exclude-perf
+        # recorded traces. We simple ignore this "second start"
+        # event.
+        return
+    assert _id not in db["running"]
+    task = Task(_id, tid, cpu, comm)
+    task.schedule_in_at(time)
+    db["running"][_id] = task
+
+
+def _time_to_internal(time_ns):
+    """
+    To prevent float rounding errors we use Decimal internally
+    """
+    return decimal.Decimal(time_ns) / decimal.Decimal(1e9)
+
+
+def _limit_filtered(tid, pid, comm):
+    if args.filter_tasks:
+        if str(tid) in args.filter_tasks or comm in args.filter_tasks:
+            return True
+        else:
+            return False
+    if args.limit_to_tasks:
+        if str(tid) in args.limit_to_tasks or comm in args.limit_to_tasks:
+            return False
+        else:
+            return True
+
+
+def _argument_filter_sanity_check():
+    if args.limit_to_tasks and args.filter_tasks:
+        sys.exit("Error: Filter and Limit at the same time active.")
+    if args.extended_times and args.summary_only:
+        sys.exit("Error: Summary only and extended times active.")
+    if args.time_limit and ":" not in args.time_limit:
+        sys.exit(
+            "Error: No bound set for time limit. Please set bound by ':' e.g :123."
+        )
+    if args.time_limit and (args.summary or args.summary_only or args.summary_extended):
+        sys.exit("Error: Cannot set time limit and print summary")
+    if args.csv_summary:
+        args.summary = True
+        if args.csv == args.csv_summary:
+            sys.exit("Error: Chosen files for csv and csv summary are the same")
+    if args.csv and (args.summary_extended or args.summary) and not args.csv_summary:
+        sys.exit("Error: No file chosen to write summary to. Choose with --csv-summary "
+        "<file>")
+    if args.csv and args.summary_only:
+        sys.exit("Error: --csv chosen and --summary-only. Standard task would not be"
+            "written to csv file.")
+
+def _argument_prepare_check():
+    global time_unit, fd_task, fd_sum
+    if args.filter_tasks:
+        args.filter_tasks = args.filter_tasks.split(",")
+    if args.limit_to_tasks:
+        args.limit_to_tasks = args.limit_to_tasks.split(",")
+    if args.time_limit:
+        args.time_limit = args.time_limit.split(":")
+    for rename_tuple in args.rename_comms_by_tids.split(","):
+        tid_name = rename_tuple.split(":")
+        if len(tid_name) != 2:
+            continue
+        args.tid_renames[int(tid_name[0])] = tid_name[1]
+    args.highlight_tasks_map = dict()
+    for highlight_tasks_tuple in args.highlight_tasks.split(","):
+        tasks_color_map = highlight_tasks_tuple.split(":")
+        # default highlight color to red if no color set by user
+        if len(tasks_color_map) == 1:
+            tasks_color_map.append("red")
+        if args.highlight_tasks and tasks_color_map[1].lower() not in _COLORS:
+            sys.exit(
+                "Error: Color not defined, please choose from grey,red,green,yellow,blue,"
+                "violet"
+            )
+        if len(tasks_color_map) != 2:
+            continue
+        args.highlight_tasks_map[tasks_color_map[0]] = tasks_color_map[1]
+    time_unit = "us"
+    if args.ns:
+        time_unit = "ns"
+    elif args.ms:
+        time_unit = "ms"
+
+
+    fd_task = sys.stdout
+    if args.csv:
+        args.stdio_color = "never"
+        fd_task = open(args.csv, "w")
+        print("generating csv at",args.csv,)
+
+    fd_sum = sys.stdout
+    if args.csv_summary:
+        args.stdio_color = "never"
+        fd_sum = open(args.csv_summary, "w")
+        print("generating csv summary at",args.csv_summary)
+        if not args.csv:
+            args.summary_only = True
+
+
+def _is_within_timelimit(time):
+    """
+    Check if a time limit was given by parameter, if so ignore the rest. If not,
+    process the recorded trace in its entirety.
+    """
+    if not args.time_limit:
+        return True
+    lower_time_limit = args.time_limit[0]
+    upper_time_limit = args.time_limit[1]
+    # check for upper limit
+    if upper_time_limit == "":
+        if time >= decimal.Decimal(lower_time_limit):
+            return True
+    # check for lower limit
+    if lower_time_limit == "":
+        if time <= decimal.Decimal(upper_time_limit):
+            return True
+        # quit if time exceeds upper limit. Good for big datasets
+        else:
+            quit()
+    if lower_time_limit != "" and upper_time_limit != "":
+        if (time >= decimal.Decimal(lower_time_limit) and
+            time <= decimal.Decimal(upper_time_limit)):
+            return True
+        # quit if time exceeds upper limit. Good for big datasets
+        elif time > decimal.Decimal(upper_time_limit):
+            quit()
+
+def _prepare_fmt_precision():
+    decimal_precision = 6
+    time_precision = 3
+    if args.ns:
+     decimal_precision = 9
+     time_precision = 0
+    return decimal_precision, time_precision
+
+def _prepare_fmt_sep():
+    separator = " "
+    fix_csv_align = 1
+    if args.csv or args.csv_summary:
+        separator = ";"
+        fix_csv_align = 0
+    return separator, fix_csv_align
+
+def trace_unhandled(event_name, context, event_fields_dict, perf_sample_dict):
+    pass
+
+
+def trace_begin():
+    _parse_args()
+    _check_color()
+    _init_db()
+    if not args.summary_only:
+        _print_header()
+
+def trace_end():
+    if args.summary or args.summary_extended or args.summary_only:
+        Summary().print()
+
+def sched__sched_switch(event_name, context, common_cpu, common_secs, common_nsecs,
+                        common_pid, common_comm, common_callchain, prev_comm,
+                        prev_pid, prev_prio, prev_state, next_comm, next_pid,
+                        next_prio, perf_sample_dict):
+    # ignore common_secs & common_nsecs cause we need
+    # high res timestamp anyway, using the raw value is
+    # faster
+    time = _time_to_internal(perf_sample_dict["sample"]["time"])
+    if not _is_within_timelimit(time):
+        # user specific --time-limit a:b set
+        return
+
+    next_comm = _filter_non_printable(next_comm)
+    _handle_task_finish(prev_pid, common_cpu, time, perf_sample_dict)
+    _handle_task_start(next_pid, common_cpu, next_comm, time)
diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build
index 2064a640facb..90fd1eb317bb 100644
--- a/tools/perf/tests/Build
+++ b/tools/perf/tests/Build
@@ -6,13 +6,13 @@ perf-y += parse-events.o
 perf-y += dso-data.o
 perf-y += attr.o
 perf-y += vmlinux-kallsyms.o
-perf-y += openat-syscall.o
-perf-y += openat-syscall-all-cpus.o
-perf-y += openat-syscall-tp-fields.o
-perf-y += mmap-basic.o
+perf-$(CONFIG_LIBTRACEEVENT) += openat-syscall.o
+perf-$(CONFIG_LIBTRACEEVENT) += openat-syscall-all-cpus.o
+perf-$(CONFIG_LIBTRACEEVENT) += openat-syscall-tp-fields.o
+perf-$(CONFIG_LIBTRACEEVENT) += mmap-basic.o
 perf-y += perf-record.o
 perf-y += evsel-roundtrip-name.o
-perf-y += evsel-tp-sched.o
+perf-$(CONFIG_LIBTRACEEVENT) += evsel-tp-sched.o
 perf-y += fdarray.o
 perf-y += pmu.o
 perf-y += pmu-events.o
@@ -30,7 +30,7 @@ perf-y += task-exit.o
 perf-y += sw-clock.o
 perf-y += mmap-thread-lookup.o
 perf-y += thread-maps-share.o
-perf-y += switch-tracking.o
+perf-$(CONFIG_LIBTRACEEVENT) += switch-tracking.o
 perf-y += keep-tracking.o
 perf-y += code-reading.o
 perf-y += sample-parsing.o
@@ -67,6 +67,7 @@ perf-y += expand-cgroup.o
 perf-y += perf-time-to-tsc.o
 perf-y += dlfilter-test.o
 perf-y += sigtrap.o
+perf-y += event_groups.o
 
 $(OUTPUT)tests/llvm-src-base.c: tests/bpf-script-example.c tests/Build
 	$(call rule_mkdir)
@@ -103,3 +104,5 @@ endif
 CFLAGS_attr.o         += -DBINDIR="BUILD_STR($(bindir_SQ))" -DPYTHON="BUILD_STR($(PYTHON_WORD))"
 CFLAGS_python-use.o   += -DPYTHONPATH="BUILD_STR($(OUTPUT)python)" -DPYTHON="BUILD_STR($(PYTHON_WORD))"
 CFLAGS_dwarf-unwind.o += -fno-optimize-sibling-calls
+
+perf-y += workloads/
diff --git a/tools/perf/tests/attr.py b/tools/perf/tests/attr.py
index cb39ac46bc73..ccfef861e931 100644
--- a/tools/perf/tests/attr.py
+++ b/tools/perf/tests/attr.py
@@ -6,9 +6,12 @@ import os
 import sys
 import glob
 import optparse
+import platform
 import tempfile
 import logging
+import re
 import shutil
+import subprocess
 
 try:
     import configparser
@@ -123,17 +126,27 @@ class Event(dict):
             if not data_equal(self[t], other[t]):
                 log.warning("expected %s=%s, got %s" % (t, self[t], other[t]))
 
+def parse_version(version):
+    if not version:
+        return None
+    return [int(v) for v in version.split(".")[0:2]]
+
 # Test file description needs to have following sections:
 # [config]
 #   - just single instance in file
 #   - needs to specify:
 #     'command' - perf command name
 #     'args'    - special command arguments
-#     'ret'     - expected command return value (0 by default)
+#     'ret'     - Skip test if Perf doesn't exit with this value (0 by default)
+#     'test_ret'- If set to 'true', fail test instead of skipping for 'ret' argument
 #     'arch'    - architecture specific test (optional)
 #                 comma separated list, ! at the beginning
 #                 negates it.
-#
+#     'auxv'    - Truthy statement that is evaled in the scope of the auxv map. When false,
+#                 the test is skipped. For example 'auxv["AT_HWCAP"] == 10'. (optional)
+#     'kernel_since' - Inclusive kernel version from which the test will start running. Only the
+#                      first two values are supported, for example "6.1" (optional)
+#     'kernel_until' - Exclusive kernel version from which the test will stop running. (optional)
 # [eventX:base]
 #   - one or multiple instances in file
 #   - expected values assignments
@@ -155,12 +168,17 @@ class Test(object):
         except:
             self.ret  = 0
 
+        self.test_ret = parser.getboolean('config', 'test_ret', fallback=False)
+
         try:
             self.arch  = parser.get('config', 'arch')
             log.warning("test limitation '%s'" % self.arch)
         except:
             self.arch  = ''
 
+        self.auxv = parser.get('config', 'auxv', fallback=None)
+        self.kernel_since = parse_version(parser.get('config', 'kernel_since', fallback=None))
+        self.kernel_until = parse_version(parser.get('config', 'kernel_until', fallback=None))
         self.expect   = {}
         self.result   = {}
         log.debug("  loading expected events");
@@ -172,7 +190,38 @@ class Test(object):
         else:
             return True
 
-    def skip_test(self, myarch):
+    def skip_test_kernel_since(self):
+        if not self.kernel_since:
+            return False
+        return not self.kernel_since <= parse_version(platform.release())
+
+    def skip_test_kernel_until(self):
+        if not self.kernel_until:
+            return False
+        return not parse_version(platform.release()) < self.kernel_until
+
+    def skip_test_auxv(self):
+        def new_auxv(a, pattern):
+            items = list(filter(None, pattern.split(a)))
+            # AT_HWCAP is hex but doesn't have a prefix, so special case it
+            if items[0] == "AT_HWCAP":
+                value = int(items[-1], 16)
+            else:
+                try:
+                    value = int(items[-1], 0)
+                except:
+                    value = items[-1]
+            return (items[0], value)
+
+        if not self.auxv:
+            return False
+        auxv = subprocess.check_output("LD_SHOW_AUXV=1 sleep 0", shell=True) \
+               .decode(sys.stdout.encoding)
+        pattern = re.compile(r"[: ]+")
+        auxv = dict([new_auxv(a, pattern) for a in auxv.splitlines()])
+        return not eval(self.auxv)
+
+    def skip_test_arch(self, myarch):
         # If architecture not set always run test
         if self.arch == '':
             # log.warning("test for arch %s is ok" % myarch)
@@ -222,9 +271,18 @@ class Test(object):
     def run_cmd(self, tempdir):
         junk1, junk2, junk3, junk4, myarch = (os.uname())
 
-        if self.skip_test(myarch):
+        if self.skip_test_arch(myarch):
             raise Notest(self, myarch)
 
+        if self.skip_test_auxv():
+            raise Notest(self, "auxv skip")
+
+        if self.skip_test_kernel_since():
+            raise Notest(self, "old kernel skip")
+
+        if self.skip_test_kernel_until():
+            raise Notest(self, "new kernel skip")
+
         cmd = "PERF_TEST_ATTR=%s %s %s -o %s/perf.data %s" % (tempdir,
               self.perf, self.command, tempdir, self.args)
         ret = os.WEXITSTATUS(os.system(cmd))
@@ -232,7 +290,10 @@ class Test(object):
         log.info("  '%s' ret '%s', expected '%s'" % (cmd, str(ret), str(self.ret)))
 
         if not data_equal(str(ret), str(self.ret)):
-            raise Unsup(self)
+            if self.test_ret:
+                raise Fail(self, "Perf exit code failure")
+            else:
+                raise Unsup(self)
 
     def compare(self, expect, result):
         match = {}
diff --git a/tools/perf/tests/attr/README b/tools/perf/tests/attr/README
index eb3f7d4bb324..4066fec7180a 100644
--- a/tools/perf/tests/attr/README
+++ b/tools/perf/tests/attr/README
@@ -49,7 +49,6 @@ Following tests are defined (with perf commands):
   perf record --call-graph dwarf kill		(test-record-graph-dwarf)
   perf record --call-graph fp kill              (test-record-graph-fp)
   perf record --call-graph fp kill              (test-record-graph-fp-aarch64)
-  perf record --group -e cycles,instructions kill (test-record-group)
   perf record -e '{cycles,instructions}' kill   (test-record-group1)
   perf record -e '{cycles/period=1/,instructions/period=2/}:S' kill (test-record-group2)
   perf record -D kill                           (test-record-no-delay)
@@ -66,6 +65,5 @@ Following tests are defined (with perf commands):
   perf stat -d kill                             (test-stat-detailed-1)
   perf stat -dd kill                            (test-stat-detailed-2)
   perf stat -ddd kill                           (test-stat-detailed-3)
-  perf stat --group -e cycles,instructions kill (test-stat-group)
   perf stat -e '{cycles,instructions}' kill     (test-stat-group1)
   perf stat -i -e cycles kill                   (test-stat-no-inherit)
diff --git a/tools/perf/tests/attr/test-record-group b/tools/perf/tests/attr/test-record-group
deleted file mode 100644
index 6c1cff8aae8b..000000000000
--- a/tools/perf/tests/attr/test-record-group
+++ /dev/null
@@ -1,22 +0,0 @@
-[config]
-command = record
-args    = --no-bpf-event --group -e cycles,instructions kill >/dev/null 2>&1
-ret     = 1
-
-[event-1:base-record]
-fd=1
-group_fd=-1
-sample_type=327
-read_format=4|20
-
-[event-2:base-record]
-fd=2
-group_fd=1
-config=1
-sample_type=327
-read_format=4|20
-mmap=0
-comm=0
-task=0
-enable_on_exec=0
-disabled=0
diff --git a/tools/perf/tests/attr/test-record-user-regs-no-sve-aarch64 b/tools/perf/tests/attr/test-record-user-regs-no-sve-aarch64
new file mode 100644
index 000000000000..fbb065842880
--- /dev/null
+++ b/tools/perf/tests/attr/test-record-user-regs-no-sve-aarch64
@@ -0,0 +1,9 @@
+# Test that asking for VG fails if the system doesn't support SVE. This
+# applies both before and after the feature was added in 6.1
+[config]
+command = record
+args    = --no-bpf-event --user-regs=vg kill >/dev/null 2>&1
+ret     = 129
+test_ret = true
+arch    = aarch64
+auxv    = auxv["AT_HWCAP"] & 0x200000 == 0
diff --git a/tools/perf/tests/attr/test-record-user-regs-old-sve-aarch64 b/tools/perf/tests/attr/test-record-user-regs-old-sve-aarch64
new file mode 100644
index 000000000000..15ebfc3418e3
--- /dev/null
+++ b/tools/perf/tests/attr/test-record-user-regs-old-sve-aarch64
@@ -0,0 +1,10 @@
+# Test that asking for VG always fails on old kernels because it was
+# added in 6.1. This applies to systems that either support or don't
+# support SVE.
+[config]
+command = record
+args    = --no-bpf-event --user-regs=vg kill >/dev/null 2>&1
+ret     = 129
+test_ret = true
+arch    = aarch64
+kernel_until = 6.1
diff --git a/tools/perf/tests/attr/test-record-user-regs-sve-aarch64 b/tools/perf/tests/attr/test-record-user-regs-sve-aarch64
new file mode 100644
index 000000000000..c598c803221d
--- /dev/null
+++ b/tools/perf/tests/attr/test-record-user-regs-sve-aarch64
@@ -0,0 +1,14 @@
+# Test that asking for VG works if the system has SVE and after the
+# feature was added in 6.1
+[config]
+command = record
+args    = --no-bpf-event --user-regs=vg kill >/dev/null 2>&1
+ret     = 1
+test_ret = true
+arch    = aarch64
+auxv    = auxv["AT_HWCAP"] & 0x200000 == 0x200000
+kernel_since = 6.1
+
+[event:base-record]
+sample_type=4359
+sample_regs_user=70368744177664
diff --git a/tools/perf/tests/attr/test-stat-group b/tools/perf/tests/attr/test-stat-group
deleted file mode 100644
index e15d6946e9b3..000000000000
--- a/tools/perf/tests/attr/test-stat-group
+++ /dev/null
@@ -1,17 +0,0 @@
-[config]
-command = stat
-args    = --group -e cycles,instructions kill >/dev/null 2>&1
-ret     = 1
-
-[event-1:base-stat]
-fd=1
-group_fd=-1
-read_format=3|15
-
-[event-2:base-stat]
-fd=2
-group_fd=1
-config=1
-disabled=0
-enable_on_exec=0
-read_format=3|15
diff --git a/tools/perf/tests/bitmap.c b/tools/perf/tests/bitmap.c
index 4965dd666956..0173f5402a35 100644
--- a/tools/perf/tests/bitmap.c
+++ b/tools/perf/tests/bitmap.c
@@ -18,7 +18,7 @@ static unsigned long *get_bitmap(const char *str, int nbits)
 
 	if (map && bm) {
 		for (i = 0; i < perf_cpu_map__nr(map); i++)
-			set_bit(perf_cpu_map__cpu(map, i).cpu, bm);
+			__set_bit(perf_cpu_map__cpu(map, i).cpu, bm);
 	}
 
 	if (map)
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index 7122eae1d98d..f6c16ad8ed50 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -38,9 +38,11 @@ struct test_suite *__weak arch_tests[] = {
 
 static struct test_suite *generic_tests[] = {
 	&suite__vmlinux_matches_kallsyms,
+#ifdef HAVE_LIBTRACEEVENT
 	&suite__openat_syscall_event,
 	&suite__openat_syscall_event_on_all_cpus,
 	&suite__basic_mmap,
+#endif
 	&suite__mem,
 	&suite__parse_events,
 	&suite__expr,
@@ -51,8 +53,10 @@ static struct test_suite *generic_tests[] = {
 	&suite__dso_data_cache,
 	&suite__dso_data_reopen,
 	&suite__perf_evsel__roundtrip_name_test,
+#ifdef HAVE_LIBTRACEEVENT
 	&suite__perf_evsel__tp_sched_test,
 	&suite__syscall_openat_tp_fields,
+#endif
 	&suite__attr,
 	&suite__hists_link,
 	&suite__python_use,
@@ -71,7 +75,9 @@ static struct test_suite *generic_tests[] = {
 	&suite__thread_maps_share,
 	&suite__hists_output,
 	&suite__hists_cumulate,
+#ifdef HAVE_LIBTRACEEVENT
 	&suite__switch_tracking,
+#endif
 	&suite__fdarray__filter,
 	&suite__fdarray__add,
 	&suite__kmod_path__parse,
@@ -110,6 +116,7 @@ static struct test_suite *generic_tests[] = {
 	&suite__perf_time_to_tsc,
 	&suite__dlfilter,
 	&suite__sigtrap,
+	&suite__event_groups,
 	NULL,
 };
 
@@ -118,6 +125,15 @@ static struct test_suite **tests[] = {
 	arch_tests,
 };
 
+static struct test_workload *workloads[] = {
+	&workload__noploop,
+	&workload__thloop,
+	&workload__leafloop,
+	&workload__sqrtloop,
+	&workload__brstack,
+	&workload__datasym,
+};
+
 static int num_subtests(const struct test_suite *t)
 {
 	int num;
@@ -475,6 +491,21 @@ static int perf_test__list(int argc, const char **argv)
 	return 0;
 }
 
+static int run_workload(const char *work, int argc, const char **argv)
+{
+	unsigned int i = 0;
+	struct test_workload *twl;
+
+	for (i = 0; i < ARRAY_SIZE(workloads); i++) {
+		twl = workloads[i];
+		if (!strcmp(twl->name, work))
+			return twl->func(argc, argv);
+	}
+
+	pr_info("No workload found: %s\n", work);
+	return -1;
+}
+
 int cmd_test(int argc, const char **argv)
 {
 	const char *test_usage[] = {
@@ -482,12 +513,14 @@ int cmd_test(int argc, const char **argv)
 	NULL,
 	};
 	const char *skip = NULL;
+	const char *workload = NULL;
 	const struct option test_options[] = {
 	OPT_STRING('s', "skip", &skip, "tests", "tests to skip"),
 	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('F', "dont-fork", &dont_fork,
 		    "Do not fork for testcase"),
+	OPT_STRING('w', "workload", &workload, "work", "workload to run for testing"),
 	OPT_END()
 	};
 	const char * const test_subcommands[] = { "list", NULL };
@@ -504,6 +537,9 @@ int cmd_test(int argc, const char **argv)
 	if (argc >= 1 && !strcmp(argv[0], "list"))
 		return perf_test__list(argc - 1, argv + 1);
 
+	if (workload)
+		return run_workload(workload, argc, argv);
+
 	symbol_conf.priv_size = sizeof(int);
 	symbol_conf.sort_by_name = true;
 	symbol_conf.try_vmlinux_path = true;
diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c
index 95feb6ef34a0..cb8cd09938d5 100644
--- a/tools/perf/tests/code-reading.c
+++ b/tools/perf/tests/code-reading.c
@@ -16,7 +16,6 @@
 #include "dso.h"
 #include "env.h"
 #include "parse-events.h"
-#include "trace-event.h"
 #include "evlist.h"
 #include "evsel.h"
 #include "thread_map.h"
@@ -28,6 +27,7 @@
 #include "util/mmap.h"
 #include "util/string2.h"
 #include "util/synthetic-events.h"
+#include "util/util.h"
 #include "thread.h"
 
 #include "tests.h"
@@ -79,7 +79,7 @@ static size_t read_objdump_chunk(const char **line, unsigned char **buf,
 	 * see disassemble_bytes() at binutils/objdump.c for details
 	 * how objdump chooses display endian)
 	 */
-	if (bytes_read > 1 && !bigendian()) {
+	if (bytes_read > 1 && !host_is_bigendian()) {
 		unsigned char *chunk_end = chunk_start + bytes_read - 1;
 		unsigned char tmp;
 
diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c
index 7c873c6ae3eb..3150fc1fed6f 100644
--- a/tools/perf/tests/cpumap.c
+++ b/tools/perf/tests/cpumap.c
@@ -6,7 +6,7 @@
 #include "util/synthetic-events.h"
 #include <string.h>
 #include <linux/bitops.h>
-#include <perf/cpumap.h>
+#include <internal/cpumap.h>
 #include "debug.h"
 
 struct machine;
diff --git a/tools/perf/tests/dlfilter-test.c b/tools/perf/tests/dlfilter-test.c
index 84352d55347d..99aa72e425e4 100644
--- a/tools/perf/tests/dlfilter-test.c
+++ b/tools/perf/tests/dlfilter-test.c
@@ -33,6 +33,7 @@
 #include "archinsn.h"
 #include "dlfilter.h"
 #include "tests.h"
+#include "util/sample.h"
 
 #define MAP_START 0x400000
 
diff --git a/tools/perf/tests/event_groups.c b/tools/perf/tests/event_groups.c
new file mode 100644
index 000000000000..029442b4e9c6
--- /dev/null
+++ b/tools/perf/tests/event_groups.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include "linux/perf_event.h"
+#include "tests.h"
+#include "debug.h"
+#include "pmu.h"
+#include "pmus.h"
+#include "header.h"
+#include "../perf-sys.h"
+
+/* hw: cycles, sw: context-switch, uncore: [arch dependent] */
+static int types[] = {0, 1, -1};
+static unsigned long configs[] = {0, 3, 0};
+
+#define NR_UNCORE_PMUS 5
+
+/* Uncore pmus that support more than 3 counters */
+static struct uncore_pmus {
+	const char *name;
+	__u64 config;
+} uncore_pmus[NR_UNCORE_PMUS] = {
+	{ "amd_l3", 0x0 },
+	{ "amd_df", 0x0 },
+	{ "uncore_imc_0", 0x1 },         /* Intel */
+	{ "core_imc", 0x318 },           /* PowerPC: core_imc/CPM_STCX_FIN/ */
+	{ "hv_24x7", 0x22000000003 },    /* PowerPC: hv_24x7/CPM_STCX_FIN/ */
+};
+
+static int event_open(int type, unsigned long config, int group_fd)
+{
+	struct perf_event_attr attr;
+
+	memset(&attr, 0, sizeof(struct perf_event_attr));
+	attr.type = type;
+	attr.size = sizeof(struct perf_event_attr);
+	attr.config = config;
+	/*
+	 * When creating an event group, typically the group leader is
+	 * initialized with disabled set to 1 and any child events are
+	 * initialized with disabled set to 0. Despite disabled being 0,
+	 * the child events will not start until the group leader is
+	 * enabled.
+	 */
+	attr.disabled = group_fd == -1 ? 1 : 0;
+
+	return sys_perf_event_open(&attr, -1, 0, group_fd, 0);
+}
+
+static int setup_uncore_event(void)
+{
+	struct perf_pmu *pmu;
+	int i, fd;
+
+	if (list_empty(&pmus))
+		perf_pmu__scan(NULL);
+
+	perf_pmus__for_each_pmu(pmu) {
+		for (i = 0; i < NR_UNCORE_PMUS; i++) {
+			if (!strcmp(uncore_pmus[i].name, pmu->name)) {
+				pr_debug("Using %s for uncore pmu event\n", pmu->name);
+				types[2] = pmu->type;
+				configs[2] = uncore_pmus[i].config;
+				/*
+				 * Check if the chosen uncore pmu event can be
+				 * used in the test. For example, incase of accessing
+				 * hv_24x7 pmu counters, partition should have
+				 * additional permissions. If not, event open will
+				 * fail. So check if the event open succeeds
+				 * before proceeding.
+				 */
+				fd = event_open(types[2], configs[2], -1);
+				if (fd < 0)
+					return -1;
+				close(fd);
+				return 0;
+			}
+		}
+	}
+	return -1;
+}
+
+static int run_test(int i, int j, int k)
+{
+	int erroneous = ((((1 << i) | (1 << j) | (1 << k)) & 5) == 5);
+	int group_fd, sibling_fd1, sibling_fd2;
+
+	group_fd = event_open(types[i], configs[i], -1);
+	if (group_fd == -1)
+		return -1;
+
+	sibling_fd1 = event_open(types[j], configs[j], group_fd);
+	if (sibling_fd1 == -1) {
+		close(group_fd);
+		return erroneous ? 0 : -1;
+	}
+
+	sibling_fd2 = event_open(types[k], configs[k], group_fd);
+	if (sibling_fd2 == -1) {
+		close(sibling_fd1);
+		close(group_fd);
+		return erroneous ? 0 : -1;
+	}
+
+	close(sibling_fd2);
+	close(sibling_fd1);
+	close(group_fd);
+	return erroneous ? -1 : 0;
+}
+
+static int test__event_groups(struct test_suite *text __maybe_unused, int subtest __maybe_unused)
+{
+	int i, j, k;
+	int ret;
+	int r;
+
+	ret = setup_uncore_event();
+	if (ret || types[2] == -1)
+		return TEST_SKIP;
+
+	ret = TEST_OK;
+	for (i = 0; i < 3; i++) {
+		for (j = 0; j < 3; j++) {
+			for (k = 0; k < 3; k++) {
+				r = run_test(i, j, k);
+				if (r)
+					ret = TEST_FAIL;
+
+				pr_debug("0x%x 0x%lx, 0x%x 0x%lx, 0x%x 0x%lx: %s\n",
+					 types[i], configs[i], types[j], configs[j],
+					 types[k], configs[k], r ? "Fail" : "Pass");
+			}
+		}
+	}
+	return ret;
+}
+
+DEFINE_SUITE("Event groups", event_groups);
diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c
index 6512f5e22045..a9eb1ed6bd63 100644
--- a/tools/perf/tests/expr.c
+++ b/tools/perf/tests/expr.c
@@ -2,6 +2,7 @@
 #include "util/cputopo.h"
 #include "util/debug.h"
 #include "util/expr.h"
+#include "util/hashmap.h"
 #include "util/header.h"
 #include "util/smt.h"
 #include "tests.h"
@@ -130,12 +131,9 @@ static int test__expr(struct test_suite *t __maybe_unused, int subtest __maybe_u
 			expr__find_ids("FOO + BAR + BAZ + BOZO", "FOO",
 					ctx) == 0);
 	TEST_ASSERT_VAL("find ids", hashmap__size(ctx->ids) == 3);
-	TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, "BAR",
-						    (void **)&val_ptr));
-	TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, "BAZ",
-						    (void **)&val_ptr));
-	TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, "BOZO",
-						    (void **)&val_ptr));
+	TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, "BAR", &val_ptr));
+	TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, "BAZ", &val_ptr));
+	TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, "BOZO", &val_ptr));
 
 	expr__ctx_clear(ctx);
 	ctx->sctx.runtime = 3;
@@ -143,20 +141,16 @@ static int test__expr(struct test_suite *t __maybe_unused, int subtest __maybe_u
 			expr__find_ids("EVENT1\\,param\\=?@ + EVENT2\\,param\\=?@",
 					NULL, ctx) == 0);
 	TEST_ASSERT_VAL("find ids", hashmap__size(ctx->ids) == 2);
-	TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, "EVENT1,param=3@",
-						    (void **)&val_ptr));
-	TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, "EVENT2,param=3@",
-						    (void **)&val_ptr));
+	TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, "EVENT1,param=3@", &val_ptr));
+	TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, "EVENT2,param=3@", &val_ptr));
 
 	expr__ctx_clear(ctx);
 	TEST_ASSERT_VAL("find ids",
 			expr__find_ids("dash\\-event1 - dash\\-event2",
 				       NULL, ctx) == 0);
 	TEST_ASSERT_VAL("find ids", hashmap__size(ctx->ids) == 2);
-	TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, "dash-event1",
-						    (void **)&val_ptr));
-	TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, "dash-event2",
-						    (void **)&val_ptr));
+	TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, "dash-event1", &val_ptr));
+	TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids, "dash-event2", &val_ptr));
 
 	/* Only EVENT1 or EVENT2 need be measured depending on the value of smt_on. */
 	{
@@ -174,7 +168,7 @@ static int test__expr(struct test_suite *t __maybe_unused, int subtest __maybe_u
 		TEST_ASSERT_VAL("find ids", hashmap__size(ctx->ids) == 1);
 		TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids,
 							  smton ? "EVENT1" : "EVENT2",
-							  (void **)&val_ptr));
+							  &val_ptr));
 
 		expr__ctx_clear(ctx);
 		TEST_ASSERT_VAL("find ids",
@@ -183,7 +177,7 @@ static int test__expr(struct test_suite *t __maybe_unused, int subtest __maybe_u
 		TEST_ASSERT_VAL("find ids", hashmap__size(ctx->ids) == 1);
 		TEST_ASSERT_VAL("find ids", hashmap__find(ctx->ids,
 							  corewide ? "EVENT1" : "EVENT2",
-							  (void **)&val_ptr));
+							  &val_ptr));
 
 	}
 	/* The expression is a constant 1.0 without needing to evaluate EVENT1. */
@@ -220,8 +214,7 @@ static int test__expr(struct test_suite *t __maybe_unused, int subtest __maybe_u
 			expr__find_ids("source_count(EVENT1)",
 			NULL, ctx) == 0);
 	TEST_ASSERT_VAL("source count", hashmap__size(ctx->ids) == 1);
-	TEST_ASSERT_VAL("source count", hashmap__find(ctx->ids, "EVENT1",
-							(void **)&val_ptr));
+	TEST_ASSERT_VAL("source count", hashmap__find(ctx->ids, "EVENT1", &val_ptr));
 
 	expr__ctx_free(ctx);
 
diff --git a/tools/perf/tests/make b/tools/perf/tests/make
index da013e90a945..05e818a8bbad 100644
--- a/tools/perf/tests/make
+++ b/tools/perf/tests/make
@@ -29,7 +29,7 @@ endif
 PARALLEL_OPT=
 ifeq ($(SET_PARALLEL),1)
   ifeq ($(JOBS),)
-    cores := $(shell (getconf _NPROCESSORS_ONLN || egrep -c '^processor|^CPU[0-9]' /proc/cpuinfo) 2>/dev/null)
+    cores := $(shell (getconf _NPROCESSORS_ONLN || grep -E -c '^processor|^CPU[0-9]' /proc/cpuinfo) 2>/dev/null)
     ifeq ($(cores),0)
       cores := 1
     endif
diff --git a/tools/perf/tests/mem2node.c b/tools/perf/tests/mem2node.c
index 4c96829510c9..a0e88c496107 100644
--- a/tools/perf/tests/mem2node.c
+++ b/tools/perf/tests/mem2node.c
@@ -33,7 +33,7 @@ static unsigned long *get_bitmap(const char *str, int nbits)
 		int i;
 
 		perf_cpu_map__for_each_cpu(cpu, i, map)
-			set_bit(cpu.cpu, bm);
+			__set_bit(cpu.cpu, bm);
 	}
 
 	if (map)
diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c
index 8322fc2295fa..e68ca6229756 100644
--- a/tools/perf/tests/mmap-basic.c
+++ b/tools/perf/tests/mmap-basic.c
@@ -5,11 +5,13 @@
 #include <perf/cpumap.h>
 
 #include "debug.h"
+#include "event.h"
 #include "evlist.h"
 #include "evsel.h"
 #include "thread_map.h"
 #include "tests.h"
 #include "util/mmap.h"
+#include "util/sample.h"
 #include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c
index a7b2800652e4..888df8eca981 100644
--- a/tools/perf/tests/openat-syscall-tp-fields.c
+++ b/tools/perf/tests/openat-syscall-tp-fields.c
@@ -14,6 +14,7 @@
 #include "util/mmap.h"
 #include <errno.h>
 #include <perf/mmap.h>
+#include "util/sample.h"
 
 #ifndef O_DIRECTORY
 #define O_DIRECTORY    00200000
diff --git a/tools/perf/tests/openat-syscall.c b/tools/perf/tests/openat-syscall.c
index 7e05b8b5cc95..131b62271bfa 100644
--- a/tools/perf/tests/openat-syscall.c
+++ b/tools/perf/tests/openat-syscall.c
@@ -7,6 +7,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
+#include <unistd.h>
 #include "thread_map.h"
 #include "evsel.h"
 #include "debug.h"
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index 459afdb256a1..71a5cb343311 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -20,6 +20,8 @@
 #define PERF_TP_SAMPLE_TYPE (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME | \
 			     PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD)
 
+#ifdef HAVE_LIBTRACEEVENT
+
 #if defined(__s390x__)
 /* Return true if kvm module is available and loaded. Test this
  * and return success when trace point kvm_s390_create_vm
@@ -76,6 +78,7 @@ static int test__checkevent_tracepoint_multi(struct evlist *evlist)
 	}
 	return TEST_OK;
 }
+#endif /* HAVE_LIBTRACEEVENT */
 
 static int test__checkevent_raw(struct evlist *evlist)
 {
@@ -222,6 +225,7 @@ static int test__checkevent_breakpoint_rw(struct evlist *evlist)
 	return TEST_OK;
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 static int test__checkevent_tracepoint_modifier(struct evlist *evlist)
 {
 	struct evsel *evsel = evlist__first(evlist);
@@ -252,6 +256,7 @@ test__checkevent_tracepoint_multi_modifier(struct evlist *evlist)
 
 	return test__checkevent_tracepoint_multi(evlist);
 }
+#endif /* HAVE_LIBTRACEEVENT */
 
 static int test__checkevent_raw_modifier(struct evlist *evlist)
 {
@@ -453,6 +458,7 @@ static int test__checkevent_pmu(struct evlist *evlist)
 	return TEST_OK;
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 static int test__checkevent_list(struct evlist *evlist)
 {
 	struct evsel *evsel = evlist__first(evlist);
@@ -491,6 +497,7 @@ static int test__checkevent_list(struct evlist *evlist)
 
 	return TEST_OK;
 }
+#endif
 
 static int test__checkevent_pmu_name(struct evlist *evlist)
 {
@@ -762,6 +769,7 @@ static int test__group2(struct evlist *evlist)
 	return TEST_OK;
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 static int test__group3(struct evlist *evlist __maybe_unused)
 {
 	struct evsel *evsel, *leader;
@@ -853,6 +861,7 @@ static int test__group3(struct evlist *evlist __maybe_unused)
 
 	return TEST_OK;
 }
+#endif
 
 static int test__group4(struct evlist *evlist __maybe_unused)
 {
@@ -1460,6 +1469,7 @@ static int test__sym_event_dc(struct evlist *evlist)
 	return TEST_OK;
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 static int count_tracepoints(void)
 {
 	struct dirent *events_ent;
@@ -1513,6 +1523,7 @@ static int test__all_tracepoints(struct evlist *evlist)
 
 	return test__checkevent_tracepoint_multi(evlist);
 }
+#endif /* HAVE_LIBTRACEVENT */
 
 static int test__hybrid_hw_event_with_pmu(struct evlist *evlist)
 {
@@ -1642,6 +1653,7 @@ struct evlist_test {
 };
 
 static const struct evlist_test test__events[] = {
+#ifdef HAVE_LIBTRACEEVENT
 	{
 		.name  = "syscalls:sys_enter_openat",
 		.check = test__checkevent_tracepoint,
@@ -1652,6 +1664,7 @@ static const struct evlist_test test__events[] = {
 		.check = test__checkevent_tracepoint_multi,
 		/* 1 */
 	},
+#endif
 	{
 		.name  = "r1a",
 		.check = test__checkevent_raw,
@@ -1702,6 +1715,7 @@ static const struct evlist_test test__events[] = {
 		.check = test__checkevent_breakpoint_w,
 		/* 1 */
 	},
+#ifdef HAVE_LIBTRACEEVENT
 	{
 		.name  = "syscalls:sys_enter_openat:k",
 		.check = test__checkevent_tracepoint_modifier,
@@ -1712,6 +1726,7 @@ static const struct evlist_test test__events[] = {
 		.check = test__checkevent_tracepoint_multi_modifier,
 		/* 3 */
 	},
+#endif
 	{
 		.name  = "r1a:kp",
 		.check = test__checkevent_raw_modifier,
@@ -1757,11 +1772,13 @@ static const struct evlist_test test__events[] = {
 		.check = test__checkevent_breakpoint_w_modifier,
 		/* 2 */
 	},
+#ifdef HAVE_LIBTRACEEVENT
 	{
 		.name  = "r1,syscalls:sys_enter_openat:k,1:1:hp",
 		.check = test__checkevent_list,
 		/* 3 */
 	},
+#endif
 	{
 		.name  = "instructions:G",
 		.check = test__checkevent_exclude_host_modifier,
@@ -1792,11 +1809,13 @@ static const struct evlist_test test__events[] = {
 		.check = test__group2,
 		/* 9 */
 	},
+#ifdef HAVE_LIBTRACEEVENT
 	{
 		.name  = "group1{syscalls:sys_enter_openat:H,cycles:kppp},group2{cycles,1:3}:G,instructions:u",
 		.check = test__group3,
 		/* 0 */
 	},
+#endif
 	{
 		.name  = "{cycles:u,instructions:kp}:p",
 		.check = test__group4,
@@ -1807,11 +1826,13 @@ static const struct evlist_test test__events[] = {
 		.check = test__group5,
 		/* 2 */
 	},
+#ifdef HAVE_LIBTRACEEVENT
 	{
 		.name  = "*:*",
 		.check = test__all_tracepoints,
 		/* 3 */
 	},
+#endif
 	{
 		.name  = "{cycles,cache-misses:G}:H",
 		.check = test__group_gh1,
@@ -1867,7 +1888,7 @@ static const struct evlist_test test__events[] = {
 		.check = test__checkevent_breakpoint_len_rw_modifier,
 		/* 4 */
 	},
-#if defined(__s390x__)
+#if defined(__s390x__) && defined(HAVE_LIBTRACEEVENT)
 	{
 		.name  = "kvm-s390:kvm_s390_create_vm",
 		.check = test__checkevent_tracepoint,
@@ -2237,6 +2258,19 @@ static int test__pmu_events(struct test_suite *test __maybe_unused, int subtest
 			pr_debug("Test PMU event failed for '%s'", name);
 			ret = combine_test_results(ret, test_ret);
 		}
+		/*
+		 * Names containing '-' are recognized as prefixes and suffixes
+		 * due to '-' being a legacy PMU separator. This fails when the
+		 * prefix or suffix collides with an existing legacy token. For
+		 * example, branch-brs has a prefix (branch) that collides with
+		 * a PE_NAME_CACHE_TYPE token causing a parse error as a suffix
+		 * isn't expected after this. As event names in the config
+		 * slashes are allowed a '-' in the name we check this works
+		 * above.
+		 */
+		if (strchr(ent->d_name, '-'))
+			continue;
+
 		snprintf(name, sizeof(name), "%s:u,cpu/event=%s/u", ent->d_name, ent->d_name);
 		e.name  = name;
 		e.check = test__checkevent_pmu_events_mix;
diff --git a/tools/perf/tests/parse-metric.c b/tools/perf/tests/parse-metric.c
index 68f5a2a03242..21b7ac00d798 100644
--- a/tools/perf/tests/parse-metric.c
+++ b/tools/perf/tests/parse-metric.c
@@ -103,7 +103,7 @@ static int __compute_metric(const char *name, struct value *vals,
 	if (err)
 		goto out;
 
-	err = evlist__alloc_stats(evlist, false);
+	err = evlist__alloc_stats(/*config=*/NULL, evlist, /*alloc_raw=*/false);
 	if (err)
 		goto out;
 
diff --git a/tools/perf/tests/parse-no-sample-id-all.c b/tools/perf/tests/parse-no-sample-id-all.c
index d62e31595ab2..202f0a9a6796 100644
--- a/tools/perf/tests/parse-no-sample-id-all.c
+++ b/tools/perf/tests/parse-no-sample-id-all.c
@@ -8,6 +8,7 @@
 #include "evlist.h"
 #include "header.h"
 #include "debug.h"
+#include "util/sample.h"
 
 static int process_event(struct evlist **pevlist, union perf_event *event)
 {
diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c
index 7aa946aa886d..1c4feec1adff 100644
--- a/tools/perf/tests/perf-record.c
+++ b/tools/perf/tests/perf-record.c
@@ -5,12 +5,14 @@
 
 #include <sched.h>
 #include <perf/mmap.h>
+#include "event.h"
 #include "evlist.h"
 #include "evsel.h"
 #include "debug.h"
 #include "record.h"
 #include "tests.h"
 #include "util/mmap.h"
+#include "util/sample.h"
 
 static int sched__get_first_possible_cpu(pid_t pid, cpu_set_t *maskp)
 {
diff --git a/tools/perf/tests/perf-time-to-tsc.c b/tools/perf/tests/perf-time-to-tsc.c
index c3aaa1ddff29..efcd71c2738a 100644
--- a/tools/perf/tests/perf-time-to-tsc.c
+++ b/tools/perf/tests/perf-time-to-tsc.c
@@ -20,6 +20,7 @@
 #include "tsc.h"
 #include "mmap.h"
 #include "tests.h"
+#include "util/sample.h"
 
 /*
  * Except x86_64/i386 and Arm64, other archs don't support TSC in perf.  Just
diff --git a/tools/perf/tests/pmu-events.c b/tools/perf/tests/pmu-events.c
index 097e05c796ab..e36d8b1610d4 100644
--- a/tools/perf/tests/pmu-events.c
+++ b/tools/perf/tests/pmu-events.c
@@ -12,6 +12,7 @@
 #include <perf/evlist.h>
 #include "util/evlist.h"
 #include "util/expr.h"
+#include "util/hashmap.h"
 #include "util/parse-events.h"
 #include "metricgroup.h"
 #include "stat.h"
@@ -889,7 +890,7 @@ static int test__parsing_callback(const struct pmu_event *pe, const struct pmu_e
 		goto out_err;
 	}
 
-	err = evlist__alloc_stats(evlist, false);
+	err = evlist__alloc_stats(/*config=*/NULL, evlist, /*alloc_raw=*/false);
 	if (err)
 		goto out_err;
 	/*
@@ -986,10 +987,10 @@ static int metric_parse_fake(const char *str)
 	 */
 	i = 1;
 	hashmap__for_each_entry(ctx->ids, cur, bkt)
-		expr__add_id_val(ctx, strdup(cur->key), i++);
+		expr__add_id_val(ctx, strdup(cur->pkey), i++);
 
 	hashmap__for_each_entry(ctx->ids, cur, bkt) {
-		if (check_parse_fake(cur->key)) {
+		if (check_parse_fake(cur->pkey)) {
 			pr_err("check_parse_fake failed\n");
 			goto out;
 		}
@@ -1003,7 +1004,7 @@ static int metric_parse_fake(const char *str)
 		 */
 		i = 1024;
 		hashmap__for_each_entry(ctx->ids, cur, bkt)
-			expr__add_id_val(ctx, strdup(cur->key), i--);
+			expr__add_id_val(ctx, strdup(cur->pkey), i--);
 		if (expr__parse(&result, ctx, str)) {
 			pr_err("expr__parse failed\n");
 			ret = -1;
diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c
index 20930dd48ee0..927c7f0cc4cc 100644
--- a/tools/perf/tests/sample-parsing.c
+++ b/tools/perf/tests/sample-parsing.c
@@ -13,7 +13,7 @@
 #include "evsel.h"
 #include "debug.h"
 #include "util/synthetic-events.h"
-#include "util/trace-event.h"
+#include "util/util.h"
 
 #include "tests.h"
 
@@ -117,7 +117,7 @@ static bool samples_same(const struct perf_sample *s1,
 		COMP(branch_stack->hw_idx);
 		for (i = 0; i < s1->branch_stack->nr; i++) {
 			if (needs_swap)
-				return ((tep_is_bigendian()) ?
+				return ((host_is_bigendian()) ?
 					(FLAG(s2).value == BS_EXPECTED_BE) :
 					(FLAG(s2).value == BS_EXPECTED_LE));
 			else
diff --git a/tools/perf/tests/shell/lib/probe_vfs_getname.sh b/tools/perf/tests/shell/lib/probe_vfs_getname.sh
index b616d42bd19d..ed0a3972c4c8 100644
--- a/tools/perf/tests/shell/lib/probe_vfs_getname.sh
+++ b/tools/perf/tests/shell/lib/probe_vfs_getname.sh
@@ -12,13 +12,13 @@ cleanup_probe_vfs_getname() {
 add_probe_vfs_getname() {
 	local verbose=$1
 	if [ $had_vfs_getname -eq 1 ] ; then
-		line=$(perf probe -L getname_flags 2>&1 | egrep 'result.*=.*filename;' | sed -r 's/[[:space:]]+([[:digit:]]+)[[:space:]]+result->uptr.*/\1/')
+		line=$(perf probe -L getname_flags 2>&1 | grep -E 'result.*=.*filename;' | sed -r 's/[[:space:]]+([[:digit:]]+)[[:space:]]+result->uptr.*/\1/')
 		perf probe -q       "vfs_getname=getname_flags:${line} pathname=result->name:string" || \
 		perf probe $verbose "vfs_getname=getname_flags:${line} pathname=filename:ustring"
 	fi
 }
 
 skip_if_no_debuginfo() {
-	add_probe_vfs_getname -v 2>&1 | egrep -q "^(Failed to find the path for the kernel|Debuginfo-analysis is not supported)|(file has no debug information)" && return 2
+	add_probe_vfs_getname -v 2>&1 | grep -E -q "^(Failed to find the path for the kernel|Debuginfo-analysis is not supported)|(file has no debug information)" && return 2
 	return 1
 }
diff --git a/tools/perf/tests/shell/lock_contention.sh b/tools/perf/tests/shell/lock_contention.sh
index 04bf604e3c6f..cc9ceb9e19ca 100755
--- a/tools/perf/tests/shell/lock_contention.sh
+++ b/tools/perf/tests/shell/lock_contention.sh
@@ -53,7 +53,7 @@ test_bpf()
 
 	if ! perf lock con -b true > /dev/null 2>&1 ; then
 		echo "[Skip] No BPF support"
-		exit
+		return
 	fi
 
 	# the perf lock contention output goes to the stderr
@@ -65,9 +65,70 @@ test_bpf()
 	fi
 }
 
+test_record_concurrent()
+{
+	echo "Testing perf lock record and perf lock contention at the same time"
+	perf lock record -o- -- perf bench sched messaging 2> /dev/null | \
+	perf lock contention -i- -E 1 -q 2> ${result}
+	if [ $(cat "${result}" | wc -l) != "1" ]; then
+		echo "[Fail] Recorded result count is not 1:" $(cat "${result}" | wc -l)
+		err=1
+		exit
+	fi
+}
+
+test_aggr_task()
+{
+	echo "Testing perf lock contention --threads"
+	perf lock contention -i ${perfdata} -t -E 1 -q 2> ${result}
+	if [ $(cat "${result}" | wc -l) != "1" ]; then
+		echo "[Fail] Recorded result count is not 1:" $(cat "${result}" | wc -l)
+		err=1
+		exit
+	fi
+
+	if ! perf lock con -b true > /dev/null 2>&1 ; then
+		return
+	fi
+
+	# the perf lock contention output goes to the stderr
+	perf lock con -a -b -t -E 1 -q -- perf bench sched messaging > /dev/null 2> ${result}
+	if [ $(cat "${result}" | wc -l) != "1" ]; then
+		echo "[Fail] BPF result count is not 1:" $(cat "${result}" | wc -l)
+		err=1
+		exit
+	fi
+}
+
+test_aggr_addr()
+{
+	echo "Testing perf lock contention --lock-addr"
+	perf lock contention -i ${perfdata} -l -E 1 -q 2> ${result}
+	if [ $(cat "${result}" | wc -l) != "1" ]; then
+		echo "[Fail] Recorded result count is not 1:" $(cat "${result}" | wc -l)
+		err=1
+		exit
+	fi
+
+	if ! perf lock con -b true > /dev/null 2>&1 ; then
+		return
+	fi
+
+	# the perf lock contention output goes to the stderr
+	perf lock con -a -b -t -E 1 -q -- perf bench sched messaging > /dev/null 2> ${result}
+	if [ $(cat "${result}" | wc -l) != "1" ]; then
+		echo "[Fail] BPF result count is not 1:" $(cat "${result}" | wc -l)
+		err=1
+		exit
+	fi
+}
+
 check
 
 test_record
 test_bpf
+test_record_concurrent
+test_aggr_task
+test_aggr_addr
 
 exit ${err}
diff --git a/tools/perf/tests/shell/pipe_test.sh b/tools/perf/tests/shell/pipe_test.sh
index 1b32b4f28391..8dd115dd35a7 100755
--- a/tools/perf/tests/shell/pipe_test.sh
+++ b/tools/perf/tests/shell/pipe_test.sh
@@ -2,68 +2,33 @@
 # perf pipe recording and injection test
 # SPDX-License-Identifier: GPL-2.0
 
-# skip if there's no compiler
-if ! [ -x "$(command -v cc)" ]; then
-	echo "failed: no compiler, install gcc"
-	exit 2
-fi
-
-file=$(mktemp /tmp/test.file.XXXXXX)
 data=$(mktemp /tmp/perf.data.XXXXXX)
+prog="perf test -w noploop"
+task="perf"
+sym="noploop"
 
-cat <<EOF | cc -o ${file} -x c -
-#include <signal.h>
-#include <stdlib.h>
-#include <unistd.h>
-
-volatile int done;
-
-void sigalrm(int sig) {
-	done = 1;
-}
-
-__attribute__((noinline)) void noploop(void) {
-	while (!done)
-		continue;
-}
-
-int main(int argc, char *argv[]) {
-	int sec = 1;
-
-	if (argc > 1)
-		sec = atoi(argv[1]);
-
-	signal(SIGALRM, sigalrm);
-	alarm(sec);
-
-	noploop();
-	return 0;
-}
-EOF
-
-
-if ! perf record -e task-clock:u -o - ${file} | perf report -i - --task | grep test.file; then
+if ! perf record -e task-clock:u -o - ${prog} | perf report -i - --task | grep ${task}; then
 	echo "cannot find the test file in the perf report"
 	exit 1
 fi
 
-if ! perf record -e task-clock:u -o - ${file} | perf inject -b | perf report -i - | grep noploop; then
+if ! perf record -e task-clock:u -o - ${prog} | perf inject -b | perf report -i - | grep ${sym}; then
 	echo "cannot find noploop function in pipe #1"
 	exit 1
 fi
 
-perf record -e task-clock:u -o - ${file} | perf inject -b -o ${data}
-if ! perf report -i ${data} | grep noploop; then
+perf record -e task-clock:u -o - ${prog} | perf inject -b -o ${data}
+if ! perf report -i ${data} | grep ${sym}; then
 	echo "cannot find noploop function in pipe #2"
 	exit 1
 fi
 
-perf record -e task-clock:u -o ${data} ${file}
-if ! perf inject -b -i ${data} | perf report -i - | grep noploop; then
+perf record -e task-clock:u -o ${data} ${prog}
+if ! perf inject -b -i ${data} | perf report -i - | grep ${sym}; then
 	echo "cannot find noploop function in pipe #3"
 	exit 1
 fi
 
 
-rm -f ${file} ${data} ${data}.old
+rm -f ${data} ${data}.old
 exit 0
diff --git a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh
index f12a4e217968..34c400ccbe04 100755
--- a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh
+++ b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh
@@ -64,7 +64,7 @@ trace_libc_inet_pton_backtrace() {
 	while read line <&3 && read -r pattern <&4; do
 		[ -z "$pattern" ] && break
 		echo $line
-		echo "$line" | egrep -q "$pattern"
+		echo "$line" | grep -E -q "$pattern"
 		if [ $? -ne 0 ] ; then
 			printf "FAIL: expected backtrace entry \"%s\" got \"%s\"\n" "$pattern" "$line"
 			return 1
diff --git a/tools/perf/tests/shell/record+script_probe_vfs_getname.sh b/tools/perf/tests/shell/record+script_probe_vfs_getname.sh
index 8d9c04e450ae..7f83b2715b9a 100755
--- a/tools/perf/tests/shell/record+script_probe_vfs_getname.sh
+++ b/tools/perf/tests/shell/record+script_probe_vfs_getname.sh
@@ -23,7 +23,7 @@ record_open_file() {
 perf_script_filenames() {
 	echo "Looking at perf.data file for vfs_getname records for the file we touched:"
 	perf script -i ${perfdata} | \
-	egrep " +touch +[0-9]+ +\[[0-9]+\] +[0-9]+\.[0-9]+: +probe:vfs_getname[_0-9]*: +\([[:xdigit:]]+\) +pathname=\"${file}\""
+	grep -E " +touch +[0-9]+ +\[[0-9]+\] +[0-9]+\.[0-9]+: +probe:vfs_getname[_0-9]*: +\([[:xdigit:]]+\) +pathname=\"${file}\""
 }
 
 add_probe_vfs_getname || skip_if_no_debuginfo
diff --git a/tools/perf/tests/shell/record.sh b/tools/perf/tests/shell/record.sh
index 301f95427159..4fbc74805d52 100755
--- a/tools/perf/tests/shell/record.sh
+++ b/tools/perf/tests/shell/record.sh
@@ -4,67 +4,89 @@
 
 set -e
 
+shelldir=$(dirname "$0")
+. "${shelldir}"/lib/waiting.sh
+
 err=0
 perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX)
+testprog="perf test -w thloop"
+testsym="test_loop"
 
 cleanup() {
-  rm -f ${perfdata}
-  rm -f ${perfdata}.old
-  trap - exit term int
+  rm -rf "${perfdata}"
+  rm -rf "${perfdata}".old
+
+  trap - EXIT TERM INT
 }
 
 trap_cleanup() {
   cleanup
   exit 1
 }
-trap trap_cleanup exit term int
+trap trap_cleanup EXIT TERM INT
 
 test_per_thread() {
   echo "Basic --per-thread mode test"
-  if ! perf record -e instructions:u -o ${perfdata} --quiet true 2> /dev/null
+  if ! perf record -o /dev/null --quiet ${testprog} 2> /dev/null
   then
-    echo "Per-thread record [Skipped instructions:u not supported]"
-    if [ $err -ne 1 ]
-    then
-      err=2
-    fi
+    echo "Per-thread record [Skipped event not supported]"
     return
   fi
-  if ! perf record -e instructions:u --per-thread -o ${perfdata} true 2> /dev/null
+  if ! perf record --per-thread -o "${perfdata}" ${testprog} 2> /dev/null
   then
-    echo "Per-thread record of instructions:u [Failed]"
+    echo "Per-thread record [Failed record]"
     err=1
     return
   fi
-  if ! perf report -i ${perfdata} -q | egrep -q true
+  if ! perf report -i "${perfdata}" -q | grep -q "${testsym}"
   then
     echo "Per-thread record [Failed missing output]"
     err=1
     return
   fi
+
+  # run the test program in background (for 30 seconds)
+  ${testprog} 30 &
+  TESTPID=$!
+
+  rm -f "${perfdata}"
+
+  wait_for_threads ${TESTPID} 2
+  perf record -p "${TESTPID}" --per-thread -o "${perfdata}" sleep 1 2> /dev/null
+  kill ${TESTPID}
+
+  if [ ! -e "${perfdata}" ]
+  then
+    echo "Per-thread record [Failed record -p]"
+    err=1
+    return
+  fi
+  if ! perf report -i "${perfdata}" -q | grep -q "${testsym}"
+  then
+    echo "Per-thread record [Failed -p missing output]"
+    err=1
+    return
+  fi
+
   echo "Basic --per-thread mode test [Success]"
 }
 
 test_register_capture() {
   echo "Register capture test"
-  if ! perf list | egrep -q 'br_inst_retired.near_call'
+  if ! perf list | grep -q 'br_inst_retired.near_call'
   then
-    echo "Register capture test [Skipped missing instruction]"
-    if [ $err -ne 1 ]
-    then
-      err=2
-    fi
+    echo "Register capture test [Skipped missing event]"
     return
   fi
-  if ! perf record --intr-regs=\? 2>&1 | egrep -q 'available registers: AX BX CX DX SI DI BP SP IP FLAGS CS SS R8 R9 R10 R11 R12 R13 R14 R15'
+  if ! perf record --intr-regs=\? 2>&1 | grep -q 'available registers: AX BX CX DX SI DI BP SP IP FLAGS CS SS R8 R9 R10 R11 R12 R13 R14 R15'
   then
     echo "Register capture test [Skipped missing registers]"
     return
   fi
-  if ! perf record -o - --intr-regs=di,r8,dx,cx -e br_inst_retired.near_call:p \
-    -c 1000 --per-thread true 2> /dev/null \
+  if ! perf record -o - --intr-regs=di,r8,dx,cx -e br_inst_retired.near_call \
+    -c 1000 --per-thread ${testprog} 2> /dev/null \
     | perf script -F ip,sym,iregs -i - 2> /dev/null \
-    | egrep -q "DI:"
+    | grep -q "DI:"
   then
     echo "Register capture test [Failed missing output]"
     err=1
@@ -73,8 +95,69 @@ test_register_capture() {
   echo "Register capture test [Success]"
 }
 
+test_system_wide() {
+  echo "Basic --system-wide mode test"
+  if ! perf record -aB --synth=no -o "${perfdata}" ${testprog} 2> /dev/null
+  then
+    echo "System-wide record [Skipped not supported]"
+    return
+  fi
+  if ! perf report -i "${perfdata}" -q | grep -q "${testsym}"
+  then
+    echo "System-wide record [Failed missing output]"
+    err=1
+    return
+  fi
+  if ! perf record -aB --synth=no -e cpu-clock,cs --threads=cpu \
+    -o "${perfdata}" ${testprog} 2> /dev/null
+  then
+    echo "System-wide record [Failed record --threads option]"
+    err=1
+    return
+  fi
+  if ! perf report -i "${perfdata}" -q | grep -q "${testsym}"
+  then
+    echo "System-wide record [Failed --threads missing output]"
+    err=1
+    return
+  fi
+  echo "Basic --system-wide mode test [Success]"
+}
+
+test_workload() {
+  echo "Basic target workload test"
+  if ! perf record -o "${perfdata}" ${testprog} 2> /dev/null
+  then
+    echo "Workload record [Failed record]"
+    err=1
+    return
+  fi
+  if ! perf report -i "${perfdata}" -q | grep -q "${testsym}"
+  then
+    echo "Workload record [Failed missing output]"
+    err=1
+    return
+  fi
+  if ! perf record -e cpu-clock,cs --threads=package \
+    -o "${perfdata}" ${testprog} 2> /dev/null
+  then
+    echo "Workload record [Failed record --threads option]"
+    err=1
+    return
+  fi
+  if ! perf report -i "${perfdata}" -q | grep -q "${testsym}"
+  then
+    echo "Workload record [Failed --threads missing output]"
+    err=1
+    return
+  fi
+  echo "Basic target workload test [Success]"
+}
+
 test_per_thread
 test_register_capture
+test_system_wide
+test_workload
 
 cleanup
 exit $err
diff --git a/tools/perf/tests/shell/record_offcpu.sh b/tools/perf/tests/shell/record_offcpu.sh
index d2eba583a2ac..e01973d4e0fb 100755
--- a/tools/perf/tests/shell/record_offcpu.sh
+++ b/tools/perf/tests/shell/record_offcpu.sh
@@ -51,7 +51,7 @@ test_offcpu_basic() {
     err=1
     return
   fi
-  if ! perf report -i ${perfdata} -q --percent-limit=90 | egrep -q sleep
+  if ! perf report -i ${perfdata} -q --percent-limit=90 | grep -E -q sleep
   then
     echo "Basic off-cpu test [Failed missing output]"
     err=1
diff --git a/tools/perf/tests/shell/stat.sh b/tools/perf/tests/shell/stat.sh
index 26a51b48aee4..2c1d3f704995 100755
--- a/tools/perf/tests/shell/stat.sh
+++ b/tools/perf/tests/shell/stat.sh
@@ -7,7 +7,7 @@ set -e
 err=0
 test_default_stat() {
   echo "Basic stat command test"
-  if ! perf stat true 2>&1 | egrep -q "Performance counter stats for 'true':"
+  if ! perf stat true 2>&1 | grep -E -q "Performance counter stats for 'true':"
   then
     echo "Basic stat command test [Failed]"
     err=1
@@ -19,7 +19,7 @@ test_default_stat() {
 test_stat_record_report() {
   echo "stat record and report test"
   if ! perf stat record -o - true | perf stat report -i - 2>&1 | \
-    egrep -q "Performance counter stats for 'pipe':"
+    grep -E -q "Performance counter stats for 'pipe':"
   then
     echo "stat record and report test [Failed]"
     err=1
@@ -55,13 +55,13 @@ test_topdown_groups() {
     echo "Topdown event group test [Skipped event parsing failed]"
     return
   fi
-  if perf stat -e '{slots,topdown-retiring}' true 2>&1 | egrep -q "<not supported>"
+  if perf stat -e '{slots,topdown-retiring}' true 2>&1 | grep -E -q "<not supported>"
   then
     echo "Topdown event group test [Failed events not supported]"
     err=1
     return
   fi
-  if perf stat -e '{topdown-retiring,slots}' true 2>&1 | egrep -q "<not supported>"
+  if perf stat -e '{topdown-retiring,slots}' true 2>&1 | grep -E -q "<not supported>"
   then
     echo "Topdown event group test [Failed slots not reordered first]"
     err=1
@@ -82,7 +82,7 @@ test_topdown_weak_groups() {
     return
   fi
   group_needs_break="{slots,topdown-bad-spec,topdown-be-bound,topdown-fe-bound,topdown-retiring,branch-instructions,branch-misses,bus-cycles,cache-misses,cache-references,cpu-cycles,instructions,mem-loads,mem-stores,ref-cycles,cache-misses,cache-references}:W"
-  if perf stat --no-merge -e "$group_needs_break" true 2>&1 | egrep -q "<not supported>"
+  if perf stat --no-merge -e "$group_needs_break" true 2>&1 | grep -E -q "<not supported>"
   then
     echo "Topdown weak groups test [Failed events not supported]"
     err=1
diff --git a/tools/perf/tests/shell/test_arm_callgraph_fp.sh b/tools/perf/tests/shell/test_arm_callgraph_fp.sh
index ec108d45d3c6..e61d8deaa0c4 100755
--- a/tools/perf/tests/shell/test_arm_callgraph_fp.sh
+++ b/tools/perf/tests/shell/test_arm_callgraph_fp.sh
@@ -4,44 +4,16 @@
 
 lscpu | grep -q "aarch64" || exit 2
 
-if ! [ -x "$(command -v cc)" ]; then
-	echo "failed: no compiler, install gcc"
-	exit 2
-fi
-
 PERF_DATA=$(mktemp /tmp/__perf_test.perf.data.XXXXX)
-TEST_PROGRAM_SOURCE=$(mktemp /tmp/test_program.XXXXX.c)
-TEST_PROGRAM=$(mktemp /tmp/test_program.XXXXX)
+TEST_PROGRAM="perf test -w leafloop"
 
 cleanup_files()
 {
 	rm -f $PERF_DATA
-	rm -f $TEST_PROGRAM_SOURCE
-	rm -f $TEST_PROGRAM
 }
 
 trap cleanup_files exit term int
 
-cat << EOF > $TEST_PROGRAM_SOURCE
-int a = 0;
-void leaf(void) {
-  for (;;)
-    a += a;
-}
-void parent(void) {
-  leaf();
-}
-int main(void) {
-  parent();
-  return 0;
-}
-EOF
-
-echo " + Compiling test program ($TEST_PROGRAM)..."
-
-CFLAGS="-g -O0 -fno-inline -fno-omit-frame-pointer"
-cc $CFLAGS $TEST_PROGRAM_SOURCE -o $TEST_PROGRAM || exit 1
-
 # Add a 1 second delay to skip samples that are not in the leaf() function
 perf record -o $PERF_DATA --call-graph fp -e cycles//u -D 1000 --user-callchains -- $TEST_PROGRAM 2> /dev/null &
 PID=$!
@@ -58,11 +30,11 @@ wait $PID
 # program
 # 	728 leaf
 # 	753 parent
-# 	76c main
+# 	76c leafloop
 # ...
 
 perf script -i $PERF_DATA -F comm,ip,sym | head -n4
 perf script -i $PERF_DATA -F comm,ip,sym | head -n4 | \
 	awk '{ if ($2 != "") sym[i++] = $2 } END { if (sym[0] != "leaf" ||
 						       sym[1] != "parent" ||
-						       sym[2] != "main") exit 1 }'
+						       sym[2] != "leafloop") exit 1 }'
diff --git a/tools/perf/tests/shell/test_arm_coresight.sh b/tools/perf/tests/shell/test_arm_coresight.sh
index daad786cf48d..565ce525c40b 100755
--- a/tools/perf/tests/shell/test_arm_coresight.sh
+++ b/tools/perf/tests/shell/test_arm_coresight.sh
@@ -49,7 +49,7 @@ perf_script_branch_samples() {
 	#   touch  6512          1         branches:u:      ffffb22082e0 strcmp+0xa0 (/lib/aarch64-linux-gnu/ld-2.27.so)
 	#   touch  6512          1         branches:u:      ffffb2208320 strcmp+0xe0 (/lib/aarch64-linux-gnu/ld-2.27.so)
 	perf script -F,-time -i ${perfdata} 2>&1 | \
-		egrep " +$1 +[0-9]+ .* +branches:(.*:)? +" > /dev/null 2>&1
+		grep -E " +$1 +[0-9]+ .* +branches:(.*:)? +" > /dev/null 2>&1
 }
 
 perf_report_branch_samples() {
@@ -60,7 +60,7 @@ perf_report_branch_samples() {
 	#    7.71%     7.71%  touch    libc-2.27.so      [.] getenv
 	#    2.59%     2.59%  touch    ld-2.27.so        [.] strcmp
 	perf report --stdio -i ${perfdata} 2>&1 | \
-		egrep " +[0-9]+\.[0-9]+% +[0-9]+\.[0-9]+% +$1 " > /dev/null 2>&1
+		grep -E " +[0-9]+\.[0-9]+% +[0-9]+\.[0-9]+% +$1 " > /dev/null 2>&1
 }
 
 perf_report_instruction_samples() {
@@ -71,7 +71,7 @@ perf_report_instruction_samples() {
 	#    5.80%  touch    libc-2.27.so   [.] getenv
 	#    4.35%  touch    ld-2.27.so     [.] _dl_fixup
 	perf report --itrace=i20i --stdio -i ${perfdata} 2>&1 | \
-		egrep " +[0-9]+\.[0-9]+% +$1" > /dev/null 2>&1
+		grep -E " +[0-9]+\.[0-9]+% +$1" > /dev/null 2>&1
 }
 
 arm_cs_report() {
@@ -87,7 +87,7 @@ is_device_sink() {
 	# If the node of "enable_sink" is existed under the device path, this
 	# means the device is a sink device.  Need to exclude 'tpiu' since it
 	# cannot support perf PMU.
-	echo "$1" | egrep -q -v "tpiu"
+	echo "$1" | grep -E -q -v "tpiu"
 
 	if [ $? -eq 0 -a -e "$1/enable_sink" ]; then
 
diff --git a/tools/perf/tests/shell/test_arm_spe.sh b/tools/perf/tests/shell/test_arm_spe.sh
index 0d47479adba8..aa094d71f5b4 100755
--- a/tools/perf/tests/shell/test_arm_spe.sh
+++ b/tools/perf/tests/shell/test_arm_spe.sh
@@ -9,7 +9,7 @@
 # German Gomez <german.gomez@arm.com>, 2021
 
 skip_if_no_arm_spe_event() {
-	perf list | egrep -q 'arm_spe_[0-9]+//' && return 0
+	perf list | grep -E -q 'arm_spe_[0-9]+//' && return 0
 
 	# arm_spe event doesn't exist
 	return 2
@@ -51,7 +51,7 @@ perf_script_samples() {
 	#	dd  3048 [002]          1    tlb-access:      ffffaa64999c __GI___libc_write+0x3c (/lib/aarch64-linux-gnu/libc-2.27.so)
 	#	dd  3048 [002]          1        memory:      ffffaa64999c __GI___libc_write+0x3c (/lib/aarch64-linux-gnu/libc-2.27.so)
 	perf script -F,-time -i ${perfdata} 2>&1 | \
-		egrep " +$1 +[0-9]+ .* +${events}:(.*:)? +" > /dev/null 2>&1
+		grep -E " +$1 +[0-9]+ .* +${events}:(.*:)? +" > /dev/null 2>&1
 }
 
 perf_report_samples() {
@@ -62,7 +62,7 @@ perf_report_samples() {
 	#    7.71%     7.71%  dd    libc-2.27.so      [.] getenv
 	#    2.59%     2.59%  dd    ld-2.27.so        [.] strcmp
 	perf report --stdio -i ${perfdata} 2>&1 | \
-		egrep " +[0-9]+\.[0-9]+% +[0-9]+\.[0-9]+% +$1 " > /dev/null 2>&1
+		grep -E " +[0-9]+\.[0-9]+% +[0-9]+\.[0-9]+% +$1 " > /dev/null 2>&1
 }
 
 arm_spe_snapshot_test() {
diff --git a/tools/perf/tests/shell/test_arm_spe_fork.sh b/tools/perf/tests/shell/test_arm_spe_fork.sh
index c920d3583d30..fad361675a1d 100755
--- a/tools/perf/tests/shell/test_arm_spe_fork.sh
+++ b/tools/perf/tests/shell/test_arm_spe_fork.sh
@@ -5,20 +5,13 @@
 # German Gomez <german.gomez@arm.com>, 2022
 
 skip_if_no_arm_spe_event() {
-	perf list | egrep -q 'arm_spe_[0-9]+//' && return 0
+	perf list | grep -E -q 'arm_spe_[0-9]+//' && return 0
 	return 2
 }
 
 skip_if_no_arm_spe_event || exit 2
 
-# skip if there's no compiler
-if ! [ -x "$(command -v cc)" ]; then
-	echo "failed: no compiler, install gcc"
-	exit 2
-fi
-
-TEST_PROGRAM_SOURCE=$(mktemp /tmp/__perf_test.program.XXXXX.c)
-TEST_PROGRAM=$(mktemp /tmp/__perf_test.program.XXXXX)
+TEST_PROGRAM="perf test -w sqrtloop 10"
 PERF_DATA=$(mktemp /tmp/__perf_test.perf.data.XXXXX)
 PERF_RECORD_LOG=$(mktemp /tmp/__perf_test.log.XXXXX)
 
@@ -27,43 +20,10 @@ cleanup_files()
 	echo "Cleaning up files..."
 	rm -f ${PERF_RECORD_LOG}
 	rm -f ${PERF_DATA}
-	rm -f ${TEST_PROGRAM_SOURCE}
-	rm -f ${TEST_PROGRAM}
 }
 
 trap cleanup_files exit term int
 
-# compile test program
-cat << EOF > $TEST_PROGRAM_SOURCE
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/wait.h>
-
-int workload() {
-  while (1)
-    sqrt(rand());
-  return 0;
-}
-
-int main() {
-  switch (fork()) {
-    case 0:
-      return workload();
-    case -1:
-      return 1;
-    default:
-      wait(NULL);
-  }
-  return 0;
-}
-EOF
-
-echo "Compiling test program..."
-CFLAGS="-lm"
-cc $TEST_PROGRAM_SOURCE $CFLAGS -o $TEST_PROGRAM || exit 1
-
 echo "Recording workload..."
 perf record -o ${PERF_DATA} -e arm_spe/period=65536/ -vvv -- $TEST_PROGRAM > ${PERF_RECORD_LOG} 2>&1 &
 PERFPID=$!
@@ -78,8 +38,6 @@ echo Log lines after 1 second = $log1
 
 kill $PERFPID
 wait $PERFPID
-# test program may leave an orphan process running the workload
-killall $(basename $TEST_PROGRAM)
 
 if [ "$log0" = "$log1" ];
 then
diff --git a/tools/perf/tests/shell/test_brstack.sh b/tools/perf/tests/shell/test_brstack.sh
index d7ff5c4b4da4..59195eb80052 100755
--- a/tools/perf/tests/shell/test_brstack.sh
+++ b/tools/perf/tests/shell/test_brstack.sh
@@ -4,13 +4,6 @@
 # SPDX-License-Identifier: GPL-2.0
 # German Gomez <german.gomez@arm.com>, 2022
 
-# we need a C compiler to build the test programs
-# so bail if none is found
-if ! [ -x "$(command -v cc)" ]; then
-	echo "failed: no compiler, install gcc"
-	exit 2
-fi
-
 # skip the test if the hardware doesn't support branch stack sampling
 # and if the architecture doesn't support filter types: any,save_type,u
 if ! perf record -o- --no-buildid --branch-filter any,save_type,u -- true > /dev/null 2>&1 ; then
@@ -19,6 +12,7 @@ if ! perf record -o- --no-buildid --branch-filter any,save_type,u -- true > /dev
 fi
 
 TMPDIR=$(mktemp -d /tmp/__perf_test.program.XXXXX)
+TESTPROG="perf test -w brstack"
 
 cleanup() {
 	rm -rf $TMPDIR
@@ -26,57 +20,24 @@ cleanup() {
 
 trap cleanup exit term int
 
-gen_test_program() {
-	# generate test program
-	cat << EOF > $1
-#define BENCH_RUNS 999999
-int cnt;
-void bar(void) {
-}			/* return */
-void foo(void) {
-	bar();		/* call */
-}			/* return */
-void bench(void) {
-  void (*foo_ind)(void) = foo;
-  if ((cnt++) % 3)	/* branch (cond) */
-    foo();		/* call */
-  bar();		/* call */
-  foo_ind();		/* call (ind) */
-}
-int main(void)
-{
-  int cnt = 0;
-  while (1) {
-    if ((cnt++) > BENCH_RUNS)
-      break;
-    bench();		/* call */
-  }			/* branch (uncond) */
-  return 0;
-}
-EOF
-}
-
 test_user_branches() {
 	echo "Testing user branch stack sampling"
 
-	gen_test_program "$TEMPDIR/program.c"
-	cc -fno-inline -g "$TEMPDIR/program.c" -o $TMPDIR/a.out
-
-	perf record -o $TMPDIR/perf.data --branch-filter any,save_type,u -- $TMPDIR/a.out > /dev/null 2>&1
+	perf record -o $TMPDIR/perf.data --branch-filter any,save_type,u -- ${TESTPROG} > /dev/null 2>&1
 	perf script -i $TMPDIR/perf.data --fields brstacksym | xargs -n1 > $TMPDIR/perf.script
 
 	# example of branch entries:
-	# 	foo+0x14/bar+0x40/P/-/-/0/CALL
+	# 	brstack_foo+0x14/brstack_bar+0x40/P/-/-/0/CALL
 
 	set -x
-	egrep -m1 "^bench\+[^ ]*/foo\+[^ ]*/IND_CALL$"	$TMPDIR/perf.script
-	egrep -m1 "^foo\+[^ ]*/bar\+[^ ]*/CALL$"	$TMPDIR/perf.script
-	egrep -m1 "^bench\+[^ ]*/foo\+[^ ]*/CALL$"	$TMPDIR/perf.script
-	egrep -m1 "^bench\+[^ ]*/bar\+[^ ]*/CALL$"	$TMPDIR/perf.script
-	egrep -m1 "^bar\+[^ ]*/foo\+[^ ]*/RET$"		$TMPDIR/perf.script
-	egrep -m1 "^foo\+[^ ]*/bench\+[^ ]*/RET$"	$TMPDIR/perf.script
-	egrep -m1 "^bench\+[^ ]*/bench\+[^ ]*/COND$"	$TMPDIR/perf.script
-	egrep -m1 "^main\+[^ ]*/main\+[^ ]*/UNCOND$"	$TMPDIR/perf.script
+	grep -E -m1 "^brstack_bench\+[^ ]*/brstack_foo\+[^ ]*/IND_CALL$"	$TMPDIR/perf.script
+	grep -E -m1 "^brstack_foo\+[^ ]*/brstack_bar\+[^ ]*/CALL$"	$TMPDIR/perf.script
+	grep -E -m1 "^brstack_bench\+[^ ]*/brstack_foo\+[^ ]*/CALL$"	$TMPDIR/perf.script
+	grep -E -m1 "^brstack_bench\+[^ ]*/brstack_bar\+[^ ]*/CALL$"	$TMPDIR/perf.script
+	grep -E -m1 "^brstack_bar\+[^ ]*/brstack_foo\+[^ ]*/RET$"		$TMPDIR/perf.script
+	grep -E -m1 "^brstack_foo\+[^ ]*/brstack_bench\+[^ ]*/RET$"	$TMPDIR/perf.script
+	grep -E -m1 "^brstack_bench\+[^ ]*/brstack_bench\+[^ ]*/COND$"	$TMPDIR/perf.script
+	grep -E -m1 "^brstack\+[^ ]*/brstack\+[^ ]*/UNCOND$"		$TMPDIR/perf.script
 	set +x
 
 	# some branch types are still not being tested:
@@ -91,15 +52,12 @@ test_filter() {
 
 	echo "Testing branch stack filtering permutation ($filter,$expect)"
 
-	gen_test_program "$TEMPDIR/program.c"
-	cc -fno-inline -g "$TEMPDIR/program.c" -o $TMPDIR/a.out
-
-	perf record -o $TMPDIR/perf.data --branch-filter $filter,save_type,u -- $TMPDIR/a.out > /dev/null 2>&1
+	perf record -o $TMPDIR/perf.data --branch-filter $filter,save_type,u -- ${TESTPROG} > /dev/null 2>&1
 	perf script -i $TMPDIR/perf.data --fields brstack | xargs -n1 > $TMPDIR/perf.script
 
 	# fail if we find any branch type that doesn't match any of the expected ones
 	# also consider UNKNOWN branch types (-)
-	if egrep -vm1 "^[^ ]*/($expect|-|( *))$" $TMPDIR/perf.script; then
+	if grep -E -vm1 "^[^ ]*/($expect|-|( *))$" $TMPDIR/perf.script; then
 		return 1
 	fi
 }
diff --git a/tools/perf/tests/shell/test_data_symbol.sh b/tools/perf/tests/shell/test_data_symbol.sh
index cd6eb54d235d..69bb6fe86c50 100755
--- a/tools/perf/tests/shell/test_data_symbol.sh
+++ b/tools/perf/tests/shell/test_data_symbol.sh
@@ -5,19 +5,13 @@
 # Leo Yan <leo.yan@linaro.org>, 2022
 
 skip_if_no_mem_event() {
-	perf mem record -e list 2>&1 | egrep -q 'available' && return 0
+	perf mem record -e list 2>&1 | grep -E -q 'available' && return 0
 	return 2
 }
 
 skip_if_no_mem_event || exit 2
 
-# skip if there's no compiler
-if ! [ -x "$(command -v cc)" ]; then
-	echo "skip: no compiler, install gcc"
-	exit 2
-fi
-
-TEST_PROGRAM=$(mktemp /tmp/__perf_test.program.XXXXX)
+TEST_PROGRAM="perf test -w datasym"
 PERF_DATA=$(mktemp /tmp/__perf_test.perf.data.XXXXX)
 
 check_result() {
@@ -45,37 +39,16 @@ cleanup_files()
 {
 	echo "Cleaning up files..."
 	rm -f ${PERF_DATA}
-	rm -f ${TEST_PROGRAM}
 }
 
 trap cleanup_files exit term int
 
-# compile test program
-echo "Compiling test program..."
-cat << EOF | cc -o ${TEST_PROGRAM} -x c -
-typedef struct _buf {
-	char data1;
-	char reserved[55];
-	char data2;
-} buf __attribute__((aligned(64)));
-
-static buf buf1;
-
-int main(void) {
-	for (;;) {
-		buf1.data1++;
-		buf1.data2 += buf1.data1;
-	}
-	return 0;
-}
-EOF
-
 echo "Recording workload..."
 
 # perf mem/c2c internally uses IBS PMU on AMD CPU which doesn't support
 # user/kernel filtering and per-process monitoring, spin program on
 # specific CPU and test in per-CPU mode.
-is_amd=$(egrep -c 'vendor_id.*AuthenticAMD' /proc/cpuinfo)
+is_amd=$(grep -E -c 'vendor_id.*AuthenticAMD' /proc/cpuinfo)
 if (($is_amd >= 1)); then
 	perf mem record -o ${PERF_DATA} -C 0 -- taskset -c 0 $TEST_PROGRAM &
 else
diff --git a/tools/perf/tests/shell/test_java_symbol.sh b/tools/perf/tests/shell/test_java_symbol.sh
index f221225808a3..90cea8811926 100755
--- a/tools/perf/tests/shell/test_java_symbol.sh
+++ b/tools/perf/tests/shell/test_java_symbol.sh
@@ -65,7 +65,7 @@ fi
 #   8.18%  jshell           jitted-50116-29.so    [.] Interpreter
 #   0.75%  Thread-1         jitted-83602-1670.so  [.] jdk.internal.jimage.BasicImageReader.getString(int)
 perf report --stdio -i ${PERF_INJ_DATA} 2>&1 | \
-	egrep " +[0-9]+\.[0-9]+% .* (Interpreter|jdk\.internal).*" > /dev/null 2>&1
+	grep -E " +[0-9]+\.[0-9]+% .* (Interpreter|jdk\.internal).*" > /dev/null 2>&1
 
 if [ $? -ne 0 ]; then
 	echo "Fail to find java symbols"
diff --git a/tools/perf/tests/shell/test_task_analyzer.sh b/tools/perf/tests/shell/test_task_analyzer.sh
new file mode 100755
index 000000000000..a98e4ab66040
--- /dev/null
+++ b/tools/perf/tests/shell/test_task_analyzer.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+# perf script task-analyzer tests
+# SPDX-License-Identifier: GPL-2.0
+
+tmpdir=$(mktemp -d /tmp/perf-script-task-analyzer-XXXXX)
+err=0
+
+cleanup() {
+  rm -f perf.data
+  rm -f perf.data.old
+  rm -f csv
+  rm -f csvsummary
+  rm -rf $tmpdir
+  trap - exit term int
+}
+
+trap_cleanup() {
+  cleanup
+  exit 1
+}
+trap trap_cleanup exit term int
+
+report() {
+	if [ $1 = 0 ]; then
+		echo "PASS: \"$2\""
+	else
+		echo "FAIL: \"$2\" Error message: \"$3\""
+		err=1
+	fi
+}
+
+check_exec_0() {
+	if [ $? != 0 ]; then
+		report 1 "invokation of ${$1} command failed"
+	fi
+}
+
+find_str_or_fail() {
+	grep -q "$1" $2
+	if [ $? != 0 ]; then
+		report 1 $3 "Failed to find required string:'${1}'."
+	else
+		report 0 $3
+	fi
+}
+
+prepare_perf_data() {
+	# 1s should be sufficient to catch at least some switches
+	perf record -e sched:sched_switch -a -- sleep 1 > /dev/null 2>&1
+}
+
+# check standard inkvokation with no arguments
+test_basic() {
+	out="$tmpdir/perf.out"
+	perf script report task-analyzer > $out
+	check_exec_0 "perf"
+	find_str_or_fail "Comm" $out ${FUNCNAME[0]}
+}
+
+test_ns_rename(){
+	out="$tmpdir/perf.out"
+	perf script report task-analyzer --ns --rename-comms-by-tids 0:random > $out
+	check_exec_0 "perf"
+	find_str_or_fail "Comm" $out ${FUNCNAME[0]}
+}
+
+test_ms_filtertasks_highlight(){
+	out="$tmpdir/perf.out"
+	perf script report task-analyzer --ms --filter-tasks perf --highlight-tasks perf \
+	> $out
+	check_exec_0 "perf"
+	find_str_or_fail "Comm" $out ${FUNCNAME[0]}
+}
+
+test_extended_times_timelimit_limittasks() {
+	out="$tmpdir/perf.out"
+	perf script report task-analyzer --extended-times --time-limit :99999 \
+	--limit-to-tasks perf > $out
+	check_exec_0 "perf"
+	find_str_or_fail "Out-Out" $out ${FUNCNAME[0]}
+}
+
+test_summary() {
+	out="$tmpdir/perf.out"
+	perf script report task-analyzer --summary > $out
+	check_exec_0 "perf"
+	find_str_or_fail "Summary" $out ${FUNCNAME[0]}
+}
+
+test_summaryextended() {
+	out="$tmpdir/perf.out"
+	perf script report task-analyzer --summary-extended > $out
+	check_exec_0 "perf"
+	find_str_or_fail "Inter Task Times" $out ${FUNCNAME[0]}
+}
+
+test_summaryonly() {
+	out="$tmpdir/perf.out"
+	perf script report task-analyzer --summary-only > $out
+	check_exec_0 "perf"
+	find_str_or_fail "Summary" $out ${FUNCNAME[0]}
+}
+
+test_extended_times_summary_ns() {
+	out="$tmpdir/perf.out"
+	perf script report task-analyzer --extended-times --summary --ns > $out
+	check_exec_0 "perf"
+	find_str_or_fail "Out-Out" $out ${FUNCNAME[0]}
+	find_str_or_fail "Summary" $out ${FUNCNAME[0]}
+}
+
+test_csv() {
+	perf script report task-analyzer --csv csv > /dev/null
+	check_exec_0 "perf"
+	find_str_or_fail "Comm;" csv ${FUNCNAME[0]}
+}
+
+test_csv_extended_times() {
+	perf script report task-analyzer --csv csv --extended-times > /dev/null
+	check_exec_0 "perf"
+	find_str_or_fail "Out-Out;" csv ${FUNCNAME[0]}
+}
+
+test_csvsummary() {
+	perf script report task-analyzer --csv-summary csvsummary > /dev/null
+	check_exec_0 "perf"
+	find_str_or_fail "Comm;" csvsummary ${FUNCNAME[0]}
+}
+
+test_csvsummary_extended() {
+	perf script report task-analyzer --csv-summary csvsummary --summary-extended \
+	>/dev/null
+	check_exec_0 "perf"
+	find_str_or_fail "Out-Out;" csvsummary ${FUNCNAME[0]}
+}
+
+prepare_perf_data
+test_basic
+test_ns_rename
+test_ms_filtertasks_highlight
+test_extended_times_timelimit_limittasks
+test_summary
+test_summaryextended
+test_summaryonly
+test_extended_times_summary_ns
+test_csv
+test_csvsummary
+test_csv_extended_times
+test_csvsummary_extended
+cleanup
+exit $err
diff --git a/tools/perf/tests/shell/trace+probe_vfs_getname.sh b/tools/perf/tests/shell/trace+probe_vfs_getname.sh
index 3d60e993d2b8..0a4bac3dd77e 100755
--- a/tools/perf/tests/shell/trace+probe_vfs_getname.sh
+++ b/tools/perf/tests/shell/trace+probe_vfs_getname.sh
@@ -18,9 +18,9 @@ skip_if_no_perf_trace || exit 2
 . $(dirname $0)/lib/probe_vfs_getname.sh
 
 trace_open_vfs_getname() {
-	evts=$(echo $(perf list syscalls:sys_enter_open* 2>/dev/null | egrep 'open(at)? ' | sed -r 's/.*sys_enter_([a-z]+) +\[.*$/\1/') | sed 's/ /,/')
+	evts=$(echo $(perf list syscalls:sys_enter_open* 2>/dev/null | grep -E 'open(at)? ' | sed -r 's/.*sys_enter_([a-z]+) +\[.*$/\1/') | sed 's/ /,/')
 	perf trace -e $evts touch $file 2>&1 | \
-	egrep " +[0-9]+\.[0-9]+ +\( +[0-9]+\.[0-9]+ ms\): +touch\/[0-9]+ open(at)?\((dfd: +CWD, +)?filename: +${file}, +flags: CREAT\|NOCTTY\|NONBLOCK\|WRONLY, +mode: +IRUGO\|IWUGO\) += +[0-9]+$"
+	grep -E " +[0-9]+\.[0-9]+ +\( +[0-9]+\.[0-9]+ ms\): +touch\/[0-9]+ open(at)?\((dfd: +CWD, +)?filename: +${file}, +flags: CREAT\|NOCTTY\|NONBLOCK\|WRONLY, +mode: +IRUGO\|IWUGO\) += +[0-9]+$"
 }
 
 
diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c
index 9cd6fec375ee..4d7493fa0105 100644
--- a/tools/perf/tests/sw-clock.c
+++ b/tools/perf/tests/sw-clock.c
@@ -13,6 +13,7 @@
 #include "util/evlist.h"
 #include "util/cpumap.h"
 #include "util/mmap.h"
+#include "util/sample.h"
 #include "util/thread_map.h"
 #include <perf/evlist.h>
 #include <perf/mmap.h>
diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c
index 87f565c7f650..b3bd14b025a8 100644
--- a/tools/perf/tests/switch-tracking.c
+++ b/tools/perf/tests/switch-tracking.c
@@ -19,6 +19,7 @@
 #include "record.h"
 #include "tests.h"
 #include "util/mmap.h"
+#include "util/sample.h"
 #include "pmu.h"
 
 static int spin_sleep(void)
diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h
index 5bbb8f6a48fc..fb4b5ad4dd0f 100644
--- a/tools/perf/tests/tests.h
+++ b/tools/perf/tests/tests.h
@@ -147,6 +147,7 @@ DECLARE_SUITE(expand_cgroup_events);
 DECLARE_SUITE(perf_time_to_tsc);
 DECLARE_SUITE(dlfilter);
 DECLARE_SUITE(sigtrap);
+DECLARE_SUITE(event_groups);
 
 /*
  * PowerPC and S390 do not support creation of instruction breakpoints using the
@@ -180,4 +181,31 @@ int test__arch_unwind_sample(struct perf_sample *sample,
 DECLARE_SUITE(vectors_page);
 #endif
 
+/*
+ * Define test workloads to be used in test suites.
+ */
+typedef int (*workload_fnptr)(int argc, const char **argv);
+
+struct test_workload {
+	const char	*name;
+	workload_fnptr	func;
+};
+
+#define DECLARE_WORKLOAD(work) \
+	extern struct test_workload workload__##work
+
+#define DEFINE_WORKLOAD(work) \
+struct test_workload workload__##work = {	\
+	.name = #work,				\
+	.func = work,				\
+}
+
+/* The list of test workloads */
+DECLARE_WORKLOAD(noploop);
+DECLARE_WORKLOAD(thloop);
+DECLARE_WORKLOAD(leafloop);
+DECLARE_WORKLOAD(sqrtloop);
+DECLARE_WORKLOAD(brstack);
+DECLARE_WORKLOAD(datasym);
+
 #endif /* TESTS_H */
diff --git a/tools/perf/tests/thread-map.c b/tools/perf/tests/thread-map.c
index e413c1387fcb..74308c1368fe 100644
--- a/tools/perf/tests/thread-map.c
+++ b/tools/perf/tests/thread-map.c
@@ -11,6 +11,7 @@
 #include "util/synthetic-events.h"
 #include <linux/zalloc.h>
 #include <perf/event.h>
+#include <internal/threadmap.h>
 
 struct perf_sample;
 struct perf_tool;
diff --git a/tools/perf/tests/workloads/Build b/tools/perf/tests/workloads/Build
new file mode 100644
index 000000000000..a1f34d5861e3
--- /dev/null
+++ b/tools/perf/tests/workloads/Build
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+
+perf-y += noploop.o
+perf-y += thloop.o
+perf-y += leafloop.o
+perf-y += sqrtloop.o
+perf-y += brstack.o
+perf-y += datasym.o
+
+CFLAGS_sqrtloop.o         = -g -O0 -fno-inline -U_FORTIFY_SOURCE
+CFLAGS_leafloop.o         = -g -O0 -fno-inline -fno-omit-frame-pointer -U_FORTIFY_SOURCE
+CFLAGS_brstack.o          = -g -O0 -fno-inline -U_FORTIFY_SOURCE
+CFLAGS_datasym.o          = -g -O0 -fno-inline -U_FORTIFY_SOURCE
diff --git a/tools/perf/tests/workloads/brstack.c b/tools/perf/tests/workloads/brstack.c
new file mode 100644
index 000000000000..0b60bd37b9d1
--- /dev/null
+++ b/tools/perf/tests/workloads/brstack.c
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <stdlib.h>
+#include "../tests.h"
+
+#define BENCH_RUNS 999999
+
+static volatile int cnt;
+
+static void brstack_bar(void) {
+}				/* return */
+
+static void brstack_foo(void) {
+	brstack_bar();		/* call */
+}				/* return */
+
+static void brstack_bench(void) {
+	void (*brstack_foo_ind)(void) = brstack_foo;
+
+	if ((cnt++) % 3)	/* branch (cond) */
+		brstack_foo();	/* call */
+	brstack_bar();		/* call */
+	brstack_foo_ind();	/* call (ind) */
+}
+
+static int brstack(int argc, const char **argv)
+{
+	int num_loops = BENCH_RUNS;
+
+	if (argc > 0)
+		num_loops = atoi(argv[0]);
+
+	while (1) {
+		if ((cnt++) > num_loops)
+			break;
+		brstack_bench();/* call */
+	}			/* branch (uncond) */
+	return 0;
+}
+
+DEFINE_WORKLOAD(brstack);
diff --git a/tools/perf/tests/workloads/datasym.c b/tools/perf/tests/workloads/datasym.c
new file mode 100644
index 000000000000..ddd40bc63448
--- /dev/null
+++ b/tools/perf/tests/workloads/datasym.c
@@ -0,0 +1,24 @@
+#include <linux/compiler.h>
+#include "../tests.h"
+
+typedef struct _buf {
+	char data1;
+	char reserved[55];
+	char data2;
+} buf __attribute__((aligned(64)));
+
+static buf buf1 = {
+	/* to have this in the data section */
+	.reserved[0] = 1,
+};
+
+static int datasym(int argc __maybe_unused, const char **argv __maybe_unused)
+{
+	for (;;) {
+		buf1.data1++;
+		buf1.data2 += buf1.data1;
+	}
+	return 0;
+}
+
+DEFINE_WORKLOAD(datasym);
diff --git a/tools/perf/tests/workloads/leafloop.c b/tools/perf/tests/workloads/leafloop.c
new file mode 100644
index 000000000000..1bf5cc97649b
--- /dev/null
+++ b/tools/perf/tests/workloads/leafloop.c
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <stdlib.h>
+#include <linux/compiler.h>
+#include "../tests.h"
+
+/* We want to check these symbols in perf script */
+noinline void leaf(volatile int b);
+noinline void parent(volatile int b);
+
+static volatile int a;
+
+noinline void leaf(volatile int b)
+{
+	for (;;)
+		a += b;
+}
+
+noinline void parent(volatile int b)
+{
+	leaf(b);
+}
+
+static int leafloop(int argc, const char **argv)
+{
+	int c = 1;
+
+	if (argc > 0)
+		c = atoi(argv[0]);
+
+	parent(c);
+	return 0;
+}
+
+DEFINE_WORKLOAD(leafloop);
diff --git a/tools/perf/tests/workloads/noploop.c b/tools/perf/tests/workloads/noploop.c
new file mode 100644
index 000000000000..940ea5910a84
--- /dev/null
+++ b/tools/perf/tests/workloads/noploop.c
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <linux/compiler.h>
+#include "../tests.h"
+
+static volatile sig_atomic_t done;
+
+static void sighandler(int sig __maybe_unused)
+{
+	done = 1;
+}
+
+static int noploop(int argc, const char **argv)
+{
+	int sec = 1;
+
+	if (argc > 0)
+		sec = atoi(argv[0]);
+
+	signal(SIGINT, sighandler);
+	signal(SIGALRM, sighandler);
+	alarm(sec);
+
+	while (!done)
+		continue;
+
+	return 0;
+}
+
+DEFINE_WORKLOAD(noploop);
diff --git a/tools/perf/tests/workloads/sqrtloop.c b/tools/perf/tests/workloads/sqrtloop.c
new file mode 100644
index 000000000000..ccc94c6a6676
--- /dev/null
+++ b/tools/perf/tests/workloads/sqrtloop.c
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <math.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <linux/compiler.h>
+#include <sys/wait.h>
+#include "../tests.h"
+
+static volatile sig_atomic_t done;
+
+static void sighandler(int sig __maybe_unused)
+{
+	done = 1;
+}
+
+static int __sqrtloop(int sec)
+{
+	signal(SIGALRM, sighandler);
+	alarm(sec);
+
+	while (!done)
+		(void)sqrt(rand());
+	return 0;
+}
+
+static int sqrtloop(int argc, const char **argv)
+{
+	int sec = 1;
+
+	if (argc > 0)
+		sec = atoi(argv[0]);
+
+	switch (fork()) {
+	case 0:
+		return __sqrtloop(sec);
+	case -1:
+		return -1;
+	default:
+		wait(NULL);
+	}
+	return 0;
+}
+
+DEFINE_WORKLOAD(sqrtloop);
diff --git a/tools/perf/tests/workloads/thloop.c b/tools/perf/tests/workloads/thloop.c
new file mode 100644
index 000000000000..29193b75717e
--- /dev/null
+++ b/tools/perf/tests/workloads/thloop.c
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <pthread.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <linux/compiler.h>
+#include "../tests.h"
+
+static volatile sig_atomic_t done;
+static volatile unsigned count;
+
+/* We want to check this symbol in perf report */
+noinline void test_loop(void);
+
+static void sighandler(int sig __maybe_unused)
+{
+	done = 1;
+}
+
+noinline void test_loop(void)
+{
+	while (!done)
+		count++;
+}
+
+static void *thfunc(void *arg)
+{
+	void (*loop_fn)(void) = arg;
+
+	loop_fn();
+	return NULL;
+}
+
+static int thloop(int argc, const char **argv)
+{
+	int sec = 1;
+	pthread_t th;
+
+	if (argc > 0)
+		sec = atoi(argv[0]);
+
+	signal(SIGINT, sighandler);
+	signal(SIGALRM, sighandler);
+	alarm(sec);
+
+	pthread_create(&th, NULL, thfunc, test_loop);
+	test_loop();
+	pthread_join(th, NULL);
+
+	return 0;
+}
+
+DEFINE_WORKLOAD(thloop);
diff --git a/tools/perf/tests/wp.c b/tools/perf/tests/wp.c
index 56455da30341..cc8719609b19 100644
--- a/tools/perf/tests/wp.c
+++ b/tools/perf/tests/wp.c
@@ -59,8 +59,10 @@ static int __event(int wp_type, void *wp_addr, unsigned long wp_len)
 	get__perf_event_attr(&attr, wp_type, wp_addr, wp_len);
 	fd = sys_perf_event_open(&attr, 0, -1, -1,
 				 perf_event_open_cloexec_flag());
-	if (fd < 0)
+	if (fd < 0) {
+		fd = -errno;
 		pr_debug("failed opening event %x\n", attr.bp_type);
+	}
 
 	return fd;
 }
@@ -77,7 +79,7 @@ static int test__wp_ro(struct test_suite *test __maybe_unused,
 
 	fd = __event(HW_BREAKPOINT_R, (void *)&data1, sizeof(data1));
 	if (fd < 0)
-		return -1;
+		return fd == -ENODEV ? TEST_SKIP : -1;
 
 	tmp = data1;
 	WP_TEST_ASSERT_VAL(fd, "RO watchpoint", 1);
@@ -101,7 +103,7 @@ static int test__wp_wo(struct test_suite *test __maybe_unused,
 
 	fd = __event(HW_BREAKPOINT_W, (void *)&data1, sizeof(data1));
 	if (fd < 0)
-		return -1;
+		return fd == -ENODEV ? TEST_SKIP : -1;
 
 	tmp = data1;
 	WP_TEST_ASSERT_VAL(fd, "WO watchpoint", 0);
@@ -126,7 +128,7 @@ static int test__wp_rw(struct test_suite *test __maybe_unused,
 	fd = __event(HW_BREAKPOINT_R | HW_BREAKPOINT_W, (void *)&data1,
 		     sizeof(data1));
 	if (fd < 0)
-		return -1;
+		return fd == -ENODEV ? TEST_SKIP : -1;
 
 	tmp = data1;
 	WP_TEST_ASSERT_VAL(fd, "RW watchpoint", 1);
@@ -150,7 +152,7 @@ static int test__wp_modify(struct test_suite *test __maybe_unused, int subtest _
 
 	fd = __event(HW_BREAKPOINT_W, (void *)&data1, sizeof(data1));
 	if (fd < 0)
-		return -1;
+		return fd == -ENODEV ? TEST_SKIP : -1;
 
 	data1 = tmp;
 	WP_TEST_ASSERT_VAL(fd, "Modify watchpoint", 1);
diff --git a/tools/perf/trace/beauty/Build b/tools/perf/trace/beauty/Build
index 433dc39053a7..d11ce256f511 100644
--- a/tools/perf/trace/beauty/Build
+++ b/tools/perf/trace/beauty/Build
@@ -17,4 +17,5 @@ perf-y += sockaddr.o
 perf-y += socket.o
 perf-y += statx.o
 perf-y += sync_file_range.o
+perf-y += timespec.o
 perf-y += tracepoints/
diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h
index f527a46ab4e7..4c59edddd6a8 100644
--- a/tools/perf/trace/beauty/beauty.h
+++ b/tools/perf/trace/beauty/beauty.h
@@ -244,6 +244,9 @@ size_t syscall_arg__scnprintf_statx_mask(char *bf, size_t size, struct syscall_a
 size_t syscall_arg__scnprintf_sync_file_range_flags(char *bf, size_t size, struct syscall_arg *arg);
 #define SCA_SYNC_FILE_RANGE_FLAGS syscall_arg__scnprintf_sync_file_range_flags
 
+size_t syscall_arg__scnprintf_timespec(char *bf, size_t size, struct syscall_arg *arg);
+#define SCA_TIMESPEC syscall_arg__scnprintf_timespec
+
 size_t open__scnprintf_flags(unsigned long flags, char *bf, size_t size, bool show_prefix);
 
 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
diff --git a/tools/perf/trace/beauty/fadvise.sh b/tools/perf/trace/beauty/fadvise.sh
index b15ae3875167..4d3dd6e56ded 100755
--- a/tools/perf/trace/beauty/fadvise.sh
+++ b/tools/perf/trace/beauty/fadvise.sh
@@ -6,7 +6,7 @@
 printf "static const char *fadvise_advices[] = {\n"
 regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+POSIX_FADV_(\w+)[[:space:]]+([[:digit:]]+)[[:space:]]+.*'
 
-egrep $regex ${header_dir}/fadvise.h | \
+grep -E $regex ${header_dir}/fadvise.h | \
 	sed -r "s/$regex/\2 \1/g"	| \
 	sort | xargs printf "\t[%s] = \"%s\",\n" | \
 	grep -v "[6].*DONTNEED" | grep -v "[7].*NOREUSE"
diff --git a/tools/perf/trace/beauty/fsmount.sh b/tools/perf/trace/beauty/fsmount.sh
index 615cc0fcf4f9..cba8897a751f 100755
--- a/tools/perf/trace/beauty/fsmount.sh
+++ b/tools/perf/trace/beauty/fsmount.sh
@@ -16,7 +16,7 @@ linux_mount=${linux_header_dir}/mount.h
 
 printf "static const char *fsmount_attr_flags[] = {\n"
 regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MOUNT_ATTR_([[:alnum:]][[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*'
-egrep $regex ${linux_mount} | grep -v MOUNT_ATTR_RELATIME | \
+grep -E $regex ${linux_mount} | grep -v MOUNT_ATTR_RELATIME | \
 	sed -r "s/$regex/\2 \1/g"	| \
 	xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n"
 printf "};\n"
diff --git a/tools/perf/trace/beauty/fspick.sh b/tools/perf/trace/beauty/fspick.sh
index b220e07ef452..1f088329b96e 100755
--- a/tools/perf/trace/beauty/fspick.sh
+++ b/tools/perf/trace/beauty/fspick.sh
@@ -11,7 +11,7 @@ linux_mount=${linux_header_dir}/mount.h
 
 printf "static const char *fspick_flags[] = {\n"
 regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+FSPICK_([[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*'
-egrep $regex ${linux_mount} | \
+grep -E $regex ${linux_mount} | \
 	sed -r "s/$regex/\2 \1/g"	| \
 	xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n"
 printf "};\n"
diff --git a/tools/perf/trace/beauty/kcmp_type.sh b/tools/perf/trace/beauty/kcmp_type.sh
index df8b17486d57..5df9dcbd674e 100755
--- a/tools/perf/trace/beauty/kcmp_type.sh
+++ b/tools/perf/trace/beauty/kcmp_type.sh
@@ -5,7 +5,7 @@
 
 printf "static const char *kcmp_types[] = {\n"
 regex='^[[:space:]]+(KCMP_(\w+)),'
-egrep $regex ${header_dir}/kcmp.h | grep -v KCMP_TYPES, | \
+grep -E $regex ${header_dir}/kcmp.h | grep -v KCMP_TYPES, | \
 	sed -r "s/$regex/\1 \2/g" | \
 	xargs printf "\t[%s]\t= \"%s\",\n"
 printf "};\n"
diff --git a/tools/perf/trace/beauty/kvm_ioctl.sh b/tools/perf/trace/beauty/kvm_ioctl.sh
index 4ce54f5bf756..bd0efd45fa9d 100755
--- a/tools/perf/trace/beauty/kvm_ioctl.sh
+++ b/tools/perf/trace/beauty/kvm_ioctl.sh
@@ -5,8 +5,8 @@
 
 printf "static const char *kvm_ioctl_cmds[] = {\n"
 regex='^#[[:space:]]*define[[:space:]]+KVM_(\w+)[[:space:]]+_IO[RW]*\([[:space:]]*KVMIO[[:space:]]*,[[:space:]]*(0x[[:xdigit:]]+).*'
-egrep $regex ${header_dir}/kvm.h	| \
+grep -E $regex ${header_dir}/kvm.h	| \
 	sed -r "s/$regex/\2 \1/g"	| \
-	egrep -v " ((ARM|PPC|S390)_|[GS]ET_(DEBUGREGS|PIT2|XSAVE|TSC_KHZ)|CREATE_SPAPR_TCE_64)" | \
+	grep -E -v " ((ARM|PPC|S390)_|[GS]ET_(DEBUGREGS|PIT2|XSAVE|TSC_KHZ)|CREATE_SPAPR_TCE_64)" | \
 	sort | xargs printf "\t[%s] = \"%s\",\n"
 printf "};\n"
diff --git a/tools/perf/trace/beauty/madvise_behavior.sh b/tools/perf/trace/beauty/madvise_behavior.sh
index 4527d290cdfc..c659c33bfc07 100755
--- a/tools/perf/trace/beauty/madvise_behavior.sh
+++ b/tools/perf/trace/beauty/madvise_behavior.sh
@@ -5,7 +5,7 @@
 
 printf "static const char *madvise_advices[] = {\n"
 regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MADV_([[:alnum:]_]+)[[:space:]]+([[:digit:]]+)[[:space:]]*.*'
-egrep $regex ${header_dir}/mman-common.h | \
+grep -E $regex ${header_dir}/mman-common.h | \
 	sed -r "s/$regex/\2 \1/g"	| \
 	sort -n | xargs printf "\t[%s] = \"%s\",\n"
 printf "};\n"
diff --git a/tools/perf/trace/beauty/mmap_flags.sh b/tools/perf/trace/beauty/mmap_flags.sh
index 76825710c725..3022597c8c17 100755
--- a/tools/perf/trace/beauty/mmap_flags.sh
+++ b/tools/perf/trace/beauty/mmap_flags.sh
@@ -15,26 +15,26 @@ fi
 linux_mman=${linux_header_dir}/mman.h
 arch_mman=${arch_header_dir}/mman.h
 
-# those in egrep -vw are flags, we want just the bits
+# those in grep -E -vw are flags, we want just the bits
 
 printf "static const char *mmap_flags[] = {\n"
 regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MAP_([[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*'
-egrep -q $regex ${arch_mman} && \
-(egrep $regex ${arch_mman} | \
+grep -E -q $regex ${arch_mman} && \
+(grep -E $regex ${arch_mman} | \
 	sed -r "s/$regex/\2 \1 \1 \1 \2/g"	| \
 	xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n#ifndef MAP_%s\n#define MAP_%s %s\n#endif\n")
-egrep -q $regex ${linux_mman} && \
-(egrep $regex ${linux_mman} | \
-	egrep -vw 'MAP_(UNINITIALIZED|TYPE|SHARED_VALIDATE)' | \
+grep -E -q $regex ${linux_mman} && \
+(grep -E $regex ${linux_mman} | \
+	grep -E -vw 'MAP_(UNINITIALIZED|TYPE|SHARED_VALIDATE)' | \
 	sed -r "s/$regex/\2 \1 \1 \1 \2/g" | \
 	xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n#ifndef MAP_%s\n#define MAP_%s %s\n#endif\n")
-([ ! -f ${arch_mman} ] || egrep -q '#[[:space:]]*include[[:space:]]+.*uapi/asm-generic/mman.*' ${arch_mman}) &&
-(egrep $regex ${header_dir}/mman-common.h | \
-	egrep -vw 'MAP_(UNINITIALIZED|TYPE|SHARED_VALIDATE)' | \
+([ ! -f ${arch_mman} ] || grep -E -q '#[[:space:]]*include[[:space:]]+.*uapi/asm-generic/mman.*' ${arch_mman}) &&
+(grep -E $regex ${header_dir}/mman-common.h | \
+	grep -E -vw 'MAP_(UNINITIALIZED|TYPE|SHARED_VALIDATE)' | \
 	sed -r "s/$regex/\2 \1 \1 \1 \2/g"	| \
 	xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n#ifndef MAP_%s\n#define MAP_%s %s\n#endif\n")
-([ ! -f ${arch_mman} ] || egrep -q '#[[:space:]]*include[[:space:]]+.*uapi/asm-generic/mman.h>.*' ${arch_mman}) &&
-(egrep $regex ${header_dir}/mman.h | \
+([ ! -f ${arch_mman} ] || grep -E -q '#[[:space:]]*include[[:space:]]+.*uapi/asm-generic/mman.h>.*' ${arch_mman}) &&
+(grep -E $regex ${header_dir}/mman.h | \
 	sed -r "s/$regex/\2 \1 \1 \1 \2/g"	| \
 	xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n#ifndef MAP_%s\n#define MAP_%s %s\n#endif\n")
 printf "};\n"
diff --git a/tools/perf/trace/beauty/mmap_prot.sh b/tools/perf/trace/beauty/mmap_prot.sh
index 664d8d534a50..49e8c865214b 100755
--- a/tools/perf/trace/beauty/mmap_prot.sh
+++ b/tools/perf/trace/beauty/mmap_prot.sh
@@ -17,14 +17,14 @@ prefix="PROT"
 
 printf "static const char *mmap_prot[] = {\n"
 regex=`printf '^[[:space:]]*#[[:space:]]*define[[:space:]]+%s_([[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*' ${prefix}`
-([ ! -f ${arch_mman} ] || egrep -q '#[[:space:]]*include[[:space:]]+.*uapi/asm-generic/mman.*' ${arch_mman}) &&
-(egrep $regex ${common_mman} | \
-	egrep -vw PROT_NONE | \
+([ ! -f ${arch_mman} ] || grep -E -q '#[[:space:]]*include[[:space:]]+.*uapi/asm-generic/mman.*' ${arch_mman}) &&
+(grep -E $regex ${common_mman} | \
+	grep -E -vw PROT_NONE | \
 	sed -r "s/$regex/\2 \1 \1 \1 \2/g"	| \
 	xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n#ifndef ${prefix}_%s\n#define ${prefix}_%s %s\n#endif\n")
-[ -f ${arch_mman} ] && egrep -q $regex ${arch_mman} && 
-(egrep $regex ${arch_mman} | \
-	egrep -vw PROT_NONE | \
+[ -f ${arch_mman} ] && grep -E -q $regex ${arch_mman} &&
+(grep -E $regex ${arch_mman} | \
+	grep -E -vw PROT_NONE | \
 	sed -r "s/$regex/\2 \1 \1 \1 \2/g"	| \
 	xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n#ifndef ${prefix}_%s\n#define ${prefix}_%s %s\n#endif\n")
 printf "};\n"
diff --git a/tools/perf/trace/beauty/mount_flags.sh b/tools/perf/trace/beauty/mount_flags.sh
index 847850b2ef6c..730099a9a67c 100755
--- a/tools/perf/trace/beauty/mount_flags.sh
+++ b/tools/perf/trace/beauty/mount_flags.sh
@@ -5,11 +5,11 @@
 
 printf "static const char *mount_flags[] = {\n"
 regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MS_([[:alnum:]_]+)[[:space:]]+([[:digit:]]+)[[:space:]]*.*'
-egrep $regex ${header_dir}/mount.h | egrep -v '(MSK|VERBOSE|MGC_VAL)\>' | \
+grep -E $regex ${header_dir}/mount.h | grep -E -v '(MSK|VERBOSE|MGC_VAL)\>' | \
 	sed -r "s/$regex/\2 \2 \1/g" | sort -n | \
 	xargs printf "\t[%s ? (ilog2(%s) + 1) : 0] = \"%s\",\n"
 regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MS_([[:alnum:]_]+)[[:space:]]+\(1<<([[:digit:]]+)\)[[:space:]]*.*'
-egrep $regex ${header_dir}/mount.h | \
+grep -E $regex ${header_dir}/mount.h | \
 	sed -r "s/$regex/\2 \1/g" | \
 	xargs printf "\t[%s + 1] = \"%s\",\n"
 printf "};\n"
diff --git a/tools/perf/trace/beauty/move_mount_flags.sh b/tools/perf/trace/beauty/move_mount_flags.sh
index 4b1d9acc0bd0..32e552faf37a 100755
--- a/tools/perf/trace/beauty/move_mount_flags.sh
+++ b/tools/perf/trace/beauty/move_mount_flags.sh
@@ -11,7 +11,7 @@ linux_mount=${linux_header_dir}/mount.h
 
 printf "static const char *move_mount_flags[] = {\n"
 regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MOVE_MOUNT_([^_]+_[[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*'
-egrep $regex ${linux_mount} | \
+grep -E $regex ${linux_mount} | \
 	sed -r "s/$regex/\2 \1/g"	| \
 	xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n"
 printf "};\n"
diff --git a/tools/perf/trace/beauty/mremap_flags.sh b/tools/perf/trace/beauty/mremap_flags.sh
index d58182300bb1..4d018350183b 100755
--- a/tools/perf/trace/beauty/mremap_flags.sh
+++ b/tools/perf/trace/beauty/mremap_flags.sh
@@ -11,8 +11,8 @@ linux_mman=${linux_header_dir}/mman.h
 
 printf "static const char *mremap_flags[] = {\n"
 regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MREMAP_([[:alnum:]_]+)[[:space:]]+((0x)?[[:xdigit:]]+)[[:space:]]*.*'
-egrep -q $regex ${linux_mman} && \
-(egrep $regex ${linux_mman} | \
+grep -E -q $regex ${linux_mman} && \
+(grep -E $regex ${linux_mman} | \
 	sed -r "s/$regex/\2 \1 \1 \1 \2/g"	| \
 	xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n#ifndef MREMAP_%s\n#define MREMAP_%s %s\n#endif\n")
 printf "};\n"
diff --git a/tools/perf/trace/beauty/perf_event_open.c b/tools/perf/trace/beauty/perf_event_open.c
index 11d47dbe63bd..01ee15fe9d0c 100644
--- a/tools/perf/trace/beauty/perf_event_open.c
+++ b/tools/perf/trace/beauty/perf_event_open.c
@@ -44,3 +44,47 @@ static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
 }
 
 #define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
+
+struct attr_fprintf_args {
+	size_t size, printed;
+	char *bf;
+	bool first;
+};
+
+static int attr__fprintf(FILE *fp __maybe_unused, const char *name, const char *val, void *priv)
+{
+	struct attr_fprintf_args *args = priv;
+	size_t printed = scnprintf(args->bf + args->printed , args->size - args->printed, "%s%s: %s", args->first ? "" : ", ", name, val);
+
+	args->first = false;
+	args->printed += printed;
+	return printed;
+}
+
+static size_t perf_event_attr___scnprintf(struct perf_event_attr *attr, char *bf, size_t size, bool show_zeros __maybe_unused)
+{
+	struct attr_fprintf_args args = {
+		.printed = scnprintf(bf, size, "{ "),
+		.size    = size,
+		.first   = true,
+		.bf	 = bf,
+	};
+
+	perf_event_attr__fprintf(stdout, attr, attr__fprintf, &args);
+	return args.printed + scnprintf(bf + args.printed, size - args.printed, " }");
+}
+
+static size_t syscall_arg__scnprintf_augmented_perf_event_attr(struct syscall_arg *arg, char *bf, size_t size)
+{
+	return perf_event_attr___scnprintf((void *)arg->augmented.args, bf, size, arg->trace->show_zeros);
+}
+
+static size_t syscall_arg__scnprintf_perf_event_attr(char *bf, size_t size, struct syscall_arg *arg)
+{
+	if (arg->augmented.args)
+		return syscall_arg__scnprintf_augmented_perf_event_attr(arg, bf, size);
+
+	return scnprintf(bf, size, "%#lx", arg->val);
+}
+
+#define SCA_PERF_ATTR syscall_arg__scnprintf_perf_event_attr
diff --git a/tools/perf/trace/beauty/perf_ioctl.sh b/tools/perf/trace/beauty/perf_ioctl.sh
index 9aabd9743ef6..06c2774f3262 100755
--- a/tools/perf/trace/beauty/perf_ioctl.sh
+++ b/tools/perf/trace/beauty/perf_ioctl.sh
@@ -5,7 +5,7 @@
 
 printf "static const char *perf_ioctl_cmds[] = {\n"
 regex='^#[[:space:]]*define[[:space:]]+PERF_EVENT_IOC_(\w+)[[:space:]]+_IO[RW]*[[:space:]]*\([[:space:]]*.\$.[[:space:]]*,[[:space:]]*([[:digit:]]+).*'
-egrep $regex ${header_dir}/perf_event.h	| \
+grep -E $regex ${header_dir}/perf_event.h	| \
 	sed -r "s/$regex/\2 \1/g"	| \
 	sort | xargs printf "\t[%s] = \"%s\",\n"
 printf "};\n"
diff --git a/tools/perf/trace/beauty/pkey_alloc_access_rights.sh b/tools/perf/trace/beauty/pkey_alloc_access_rights.sh
index f8f1b560cf8a..74da888bcdd3 100755
--- a/tools/perf/trace/beauty/pkey_alloc_access_rights.sh
+++ b/tools/perf/trace/beauty/pkey_alloc_access_rights.sh
@@ -5,7 +5,7 @@
 
 printf "static const char *pkey_alloc_access_rights[] = {\n"
 regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+PKEY_([[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*'
-egrep $regex ${header_dir}/mman-common.h	| \
+grep -E $regex ${header_dir}/mman-common.h	| \
 	sed -r "s/$regex/\2 \2 \1/g"	| \
 	sort | xargs printf "\t[%s ? (ilog2(%s) + 1) : 0] = \"%s\",\n"
 printf "};\n"
diff --git a/tools/perf/trace/beauty/prctl_option.sh b/tools/perf/trace/beauty/prctl_option.sh
index 3d278785fe57..8059342ca412 100755
--- a/tools/perf/trace/beauty/prctl_option.sh
+++ b/tools/perf/trace/beauty/prctl_option.sh
@@ -5,14 +5,14 @@
 
 printf "static const char *prctl_options[] = {\n"
 regex='^#define[[:space:]]{1}PR_(\w+)[[:space:]]*([[:xdigit:]]+)([[:space:]]*\/.*)?$'
-egrep $regex ${header_dir}/prctl.h | grep -v PR_SET_PTRACER | \
+grep -E $regex ${header_dir}/prctl.h | grep -v PR_SET_PTRACER | \
 	sed -r "s/$regex/\2 \1/g"	| \
 	sort -n | xargs printf "\t[%s] = \"%s\",\n"
 printf "};\n"
 
 printf "static const char *prctl_set_mm_options[] = {\n"
 regex='^#[[:space:]]+define[[:space:]]+PR_SET_MM_(\w+)[[:space:]]*([[:digit:]]+).*'
-egrep $regex ${header_dir}/prctl.h | \
+grep -E $regex ${header_dir}/prctl.h | \
 	sed -r "s/$regex/\2 \1/g"	| \
 	sort -n | xargs printf "\t[%s] = \"%s\",\n"
 printf "};\n"
diff --git a/tools/perf/trace/beauty/rename_flags.sh b/tools/perf/trace/beauty/rename_flags.sh
index 54c87c782ab2..94bf7f45d28e 100755
--- a/tools/perf/trace/beauty/rename_flags.sh
+++ b/tools/perf/trace/beauty/rename_flags.sh
@@ -8,8 +8,8 @@ fs_header=${header_dir}/fs.h
 
 printf "static const char *rename_flags[] = {\n"
 regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+RENAME_([[:alnum:]_]+)[[:space:]]+\(1[[:space:]]*<<[[:space:]]*([[:xdigit:]]+)[[:space:]]*\)[[:space:]]*.*'
-egrep -q $regex ${fs_header} && \
-(egrep $regex ${fs_header} | \
+grep -E -q $regex ${fs_header} && \
+(grep -E $regex ${fs_header} | \
 	sed -r "s/$regex/\2 \1/g"	| \
 	xargs printf "\t[%d + 1] = \"%s\",\n")
 printf "};\n"
diff --git a/tools/perf/trace/beauty/sockaddr.sh b/tools/perf/trace/beauty/sockaddr.sh
index 3820e5c82293..a59827ea1437 100755
--- a/tools/perf/trace/beauty/sockaddr.sh
+++ b/tools/perf/trace/beauty/sockaddr.sh
@@ -17,8 +17,8 @@ printf "static const char *socket_families[] = {\n"
 # #define AF_LOCAL	1	/* POSIX name for AF_UNIX	*/
 regex='^#define[[:space:]]+AF_(\w+)[[:space:]]+([[:digit:]]+).*'
 
-egrep $regex ${header_dir}/socket.h | \
+grep -E $regex ${header_dir}/socket.h | \
 	sed -r "s/$regex/\2 \1/g"	| \
 	xargs printf "\t[%s] = \"%s\",\n" | \
-	egrep -v "\"(UNIX|MAX)\""
+	grep -E -v "\"(UNIX|MAX)\""
 printf "};\n"
diff --git a/tools/perf/trace/beauty/socket.sh b/tools/perf/trace/beauty/socket.sh
index 76330acb27e5..8bc7ba62203e 100755
--- a/tools/perf/trace/beauty/socket.sh
+++ b/tools/perf/trace/beauty/socket.sh
@@ -12,7 +12,7 @@ fi
 printf "static const char *socket_ipproto[] = {\n"
 ipproto_regex='^[[:space:]]+IPPROTO_(\w+)[[:space:]]+=[[:space:]]+([[:digit:]]+),.*'
 
-egrep $ipproto_regex ${uapi_header_dir}/in.h | \
+grep -E $ipproto_regex ${uapi_header_dir}/in.h | \
 	sed -r "s/$ipproto_regex/\2 \1/g"	| \
 	sort -n | xargs printf "\t[%s] = \"%s\",\n"
 printf "};\n\n"
@@ -20,7 +20,7 @@ printf "};\n\n"
 printf "static const char *socket_level[] = {\n"
 socket_level_regex='^#define[[:space:]]+SOL_(\w+)[[:space:]]+([[:digit:]]+)([[:space:]]+\/.*)?'
 
-egrep $socket_level_regex ${beauty_header_dir}/socket.h | \
+grep -E $socket_level_regex ${beauty_header_dir}/socket.h | \
 	sed -r "s/$socket_level_regex/\2 \1/g"	| \
 	sort -n | xargs printf "\t[%s] = \"%s\",\n"
 printf "};\n\n"
diff --git a/tools/perf/trace/beauty/sync_file_range.sh b/tools/perf/trace/beauty/sync_file_range.sh
index 7a9282d04e44..90bf633be879 100755
--- a/tools/perf/trace/beauty/sync_file_range.sh
+++ b/tools/perf/trace/beauty/sync_file_range.sh
@@ -11,7 +11,7 @@ linux_fs=${linux_header_dir}/fs.h
 
 printf "static const char *sync_file_range_flags[] = {\n"
 regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+SYNC_FILE_RANGE_([[:alnum:]_]+)[[:space:]]+([[:xdigit:]]+)[[:space:]]*.*'
-egrep $regex ${linux_fs} | \
+grep -E $regex ${linux_fs} | \
 	sed -r "s/$regex/\2 \1/g"	| \
 	xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n"
 printf "};\n"
diff --git a/tools/perf/trace/beauty/timespec.c b/tools/perf/trace/beauty/timespec.c
new file mode 100644
index 000000000000..e1a61f092aad
--- /dev/null
+++ b/tools/perf/trace/beauty/timespec.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: LGPL-2.1
+// Copyright (C) 2022, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
+
+#include "trace/beauty/beauty.h"
+#include <inttypes.h>
+#include <time.h>
+
+static size_t syscall_arg__scnprintf_augmented_timespec(struct syscall_arg *arg, char *bf, size_t size)
+{
+	struct timespec *ts = (struct timespec *)arg->augmented.args;
+
+	return scnprintf(bf, size, "{ .tv_sec: %" PRIu64 ", .tv_nsec: %" PRIu64 " }", ts->tv_sec, ts->tv_nsec);
+}
+
+size_t syscall_arg__scnprintf_timespec(char *bf, size_t size, struct syscall_arg *arg)
+{
+	if (arg->augmented.args)
+		return syscall_arg__scnprintf_augmented_timespec(arg, bf, size);
+
+	return scnprintf(bf, size, "%#lx", arg->val);
+}
diff --git a/tools/perf/trace/beauty/tracepoints/x86_irq_vectors.sh b/tools/perf/trace/beauty/tracepoints/x86_irq_vectors.sh
index f920003723b3..eed9ce0fcbe6 100755
--- a/tools/perf/trace/beauty/tracepoints/x86_irq_vectors.sh
+++ b/tools/perf/trace/beauty/tracepoints/x86_irq_vectors.sh
@@ -15,12 +15,12 @@ x86_irq_vectors=${arch_x86_header_dir}/irq_vectors.h
 # the time of writing of this script was: IRQ_MOVE_CLEANUP_VECTOR.
 
 first_external_regex='^#define[[:space:]]+FIRST_EXTERNAL_VECTOR[[:space:]]+(0x[[:xdigit:]]+)$'
-first_external_vector=$(egrep ${first_external_regex} ${x86_irq_vectors} | sed -r "s/${first_external_regex}/\1/g")
+first_external_vector=$(grep -E ${first_external_regex} ${x86_irq_vectors} | sed -r "s/${first_external_regex}/\1/g")
 
 printf "static const char *x86_irq_vectors[] = {\n"
 regex='^#define[[:space:]]+([[:alnum:]_]+)_VECTOR[[:space:]]+(0x[[:xdigit:]]+)$'
 sed -r "s/FIRST_EXTERNAL_VECTOR/${first_external_vector}/g" ${x86_irq_vectors} | \
-egrep ${regex} | \
+grep -E ${regex} | \
 	sed -r "s/${regex}/\2 \1/g" | sort -n | \
 	xargs printf "\t[%s] = \"%s\",\n"
 printf "};\n\n"
diff --git a/tools/perf/trace/beauty/tracepoints/x86_msr.sh b/tools/perf/trace/beauty/tracepoints/x86_msr.sh
index 9b0614a87831..0078689963e0 100755
--- a/tools/perf/trace/beauty/tracepoints/x86_msr.sh
+++ b/tools/perf/trace/beauty/tracepoints/x86_msr.sh
@@ -15,7 +15,7 @@ x86_msr_index=${arch_x86_header_dir}/msr-index.h
 
 printf "static const char *x86_MSRs[] = {\n"
 regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MSR_([[:alnum:]][[:alnum:]_]+)[[:space:]]+(0x00000[[:xdigit:]]+)[[:space:]]*.*'
-egrep $regex ${x86_msr_index} | egrep -v 'MSR_(ATOM|P[46]|IA32_(TSC_DEADLINE|UCODE_REV)|IDT_FCR4)' | \
+grep -E $regex ${x86_msr_index} | grep -E -v 'MSR_(ATOM|P[46]|IA32_(TSC_DEADLINE|UCODE_REV)|IDT_FCR4)' | \
 	sed -r "s/$regex/\2 \1/g" | sort -n | \
 	xargs printf "\t[%s] = \"%s\",\n"
 printf "};\n\n"
@@ -23,18 +23,18 @@ printf "};\n\n"
 # Remove MSR_K6_WHCR, clashes with MSR_LSTAR
 regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MSR_([[:alnum:]][[:alnum:]_]+)[[:space:]]+(0xc0000[[:xdigit:]]+)[[:space:]]*.*'
 printf "#define x86_64_specific_MSRs_offset "
-egrep $regex ${x86_msr_index} | sed -r "s/$regex/\2/g" | sort -n | head -1
+grep -E $regex ${x86_msr_index} | sed -r "s/$regex/\2/g" | sort -n | head -1
 printf "static const char *x86_64_specific_MSRs[] = {\n"
-egrep $regex ${x86_msr_index} | \
-	sed -r "s/$regex/\2 \1/g" | egrep -vw 'K6_WHCR' | sort -n | \
+grep -E $regex ${x86_msr_index} | \
+	sed -r "s/$regex/\2 \1/g" | grep -E -vw 'K6_WHCR' | sort -n | \
 	xargs printf "\t[%s - x86_64_specific_MSRs_offset] = \"%s\",\n"
 printf "};\n\n"
 
 regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MSR_([[:alnum:]][[:alnum:]_]+)[[:space:]]+(0xc0010[[:xdigit:]]+)[[:space:]]*.*'
 printf "#define x86_AMD_V_KVM_MSRs_offset "
-egrep $regex ${x86_msr_index} | sed -r "s/$regex/\2/g" | sort -n | head -1
+grep -E $regex ${x86_msr_index} | sed -r "s/$regex/\2/g" | sort -n | head -1
 printf "static const char *x86_AMD_V_KVM_MSRs[] = {\n"
-egrep $regex ${x86_msr_index} | \
+grep -E $regex ${x86_msr_index} | \
 	sed -r "s/$regex/\2 \1/g" | sort -n | \
 	xargs printf "\t[%s - x86_AMD_V_KVM_MSRs_offset] = \"%s\",\n"
 printf "};\n"
diff --git a/tools/perf/trace/beauty/usbdevfs_ioctl.sh b/tools/perf/trace/beauty/usbdevfs_ioctl.sh
index aa597ae53747..b39cfb3720b8 100755
--- a/tools/perf/trace/beauty/usbdevfs_ioctl.sh
+++ b/tools/perf/trace/beauty/usbdevfs_ioctl.sh
@@ -8,14 +8,14 @@
 
 printf "static const char *usbdevfs_ioctl_cmds[] = {\n"
 regex="^#[[:space:]]*define[[:space:]]+USBDEVFS_(\w+)(\(\w+\))?[[:space:]]+_IO[CWR]{0,2}\([[:space:]]*(_IOC_\w+,[[:space:]]*)?'U'[[:space:]]*,[[:space:]]*([[:digit:]]+).*"
-egrep "$regex" ${header_dir}/usbdevice_fs.h | egrep -v 'USBDEVFS_\w+32[[:space:]]' | \
+grep -E "$regex" ${header_dir}/usbdevice_fs.h | grep -E -v 'USBDEVFS_\w+32[[:space:]]' | \
 	sed -r "s/$regex/\4 \1/g"	| \
 	sort | xargs printf "\t[%s] = \"%s\",\n"
 printf "};\n\n"
 printf "#if 0\n"
 printf "static const char *usbdevfs_ioctl_32_cmds[] = {\n"
 regex="^#[[:space:]]*define[[:space:]]+USBDEVFS_(\w+)[[:space:]]+_IO[WR]{0,2}\([[:space:]]*'U'[[:space:]]*,[[:space:]]*([[:digit:]]+).*"
-egrep $regex ${header_dir}/usbdevice_fs.h | egrep 'USBDEVFS_\w+32[[:space:]]' | \
+grep -E $regex ${header_dir}/usbdevice_fs.h | grep -E 'USBDEVFS_\w+32[[:space:]]' | \
 	sed -r "s/$regex/\2 \1/g"	| \
 	sort | xargs printf "\t[%s] = \"%s\",\n"
 printf "};\n"
diff --git a/tools/perf/trace/beauty/vhost_virtio_ioctl.sh b/tools/perf/trace/beauty/vhost_virtio_ioctl.sh
index 439773daaf77..2dd0a3b1f55a 100755
--- a/tools/perf/trace/beauty/vhost_virtio_ioctl.sh
+++ b/tools/perf/trace/beauty/vhost_virtio_ioctl.sh
@@ -5,14 +5,14 @@
 
 printf "static const char *vhost_virtio_ioctl_cmds[] = {\n"
 regex='^#[[:space:]]*define[[:space:]]+VHOST_(\w+)[[:space:]]+_IOW?\([[:space:]]*VHOST_VIRTIO[[:space:]]*,[[:space:]]*(0x[[:xdigit:]]+).*'
-egrep $regex ${header_dir}/vhost.h | \
+grep -E $regex ${header_dir}/vhost.h | \
 	sed -r "s/$regex/\2 \1/g"	| \
 	sort | xargs printf "\t[%s] = \"%s\",\n"
 printf "};\n"
 
 printf "static const char *vhost_virtio_ioctl_read_cmds[] = {\n"
 regex='^#[[:space:]]*define[[:space:]]+VHOST_(\w+)[[:space:]]+_IOW?R\([[:space:]]*VHOST_VIRTIO[[:space:]]*,[[:space:]]*(0x[[:xdigit:]]+).*'
-egrep $regex ${header_dir}/vhost.h | \
+grep -E $regex ${header_dir}/vhost.h | \
 	sed -r "s/$regex/\2 \1/g"	| \
 	sort | xargs printf "\t[%s] = \"%s\",\n"
 printf "};\n"
diff --git a/tools/perf/trace/beauty/x86_arch_prctl.sh b/tools/perf/trace/beauty/x86_arch_prctl.sh
index 7372d3cab959..57fa6aaffe70 100755
--- a/tools/perf/trace/beauty/x86_arch_prctl.sh
+++ b/tools/perf/trace/beauty/x86_arch_prctl.sh
@@ -15,8 +15,8 @@ print_range () {
 	printf "static const char *x86_arch_prctl_codes_%d[] = {\n" $idx
 	regex=`printf '^[[:space:]]*#[[:space:]]*define[[:space:]]+ARCH_([[:alnum:]_]+)[[:space:]]+(%s[[:xdigit:]]+).*' ${prefix}`
 	fmt="\t[%#x - ${first_entry}]= \"%s\",\n"
-	egrep -q $regex ${prctl_arch_header} && \
-	(egrep $regex ${prctl_arch_header} | \
+	grep -E -q $regex ${prctl_arch_header} && \
+	(grep -E $regex ${prctl_arch_header} | \
 		sed -r "s/$regex/\2 \1/g"	| \
 		xargs printf "$fmt")
 	printf "};\n\n"
diff --git a/tools/perf/ui/util.c b/tools/perf/ui/util.c
index 689b27c34246..1d38ddf01b60 100644
--- a/tools/perf/ui/util.c
+++ b/tools/perf/ui/util.c
@@ -15,6 +15,9 @@ static int perf_stdio__error(const char *format, va_list args)
 
 static int perf_stdio__warning(const char *format, va_list args)
 {
+	if (quiet)
+		return 0;
+
 	fprintf(stderr, "Warning:\n");
 	vfprintf(stderr, format, args);
 	return 0;
@@ -45,6 +48,8 @@ int ui__warning(const char *format, ...)
 {
 	int ret;
 	va_list args;
+	if (quiet)
+		return 0;
 
 	va_start(args, format);
 	ret = perf_eops->warning(format, args);
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index e315ecaec323..79b9498886a2 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -19,7 +19,6 @@ perf-y += perf_event_attr_fprintf.o
 perf-y += evswitch.o
 perf-y += find_bit.o
 perf-y += get_current_dir_name.o
-perf-y += kallsyms.o
 perf-y += levenshtein.o
 perf-y += llvm-utils.o
 perf-y += mmap.o
@@ -70,18 +69,19 @@ perf-y += namespaces.o
 perf-y += comm.o
 perf-y += thread.o
 perf-y += thread_map.o
-perf-y += trace-event-parse.o
 perf-y += parse-events-flex.o
 perf-y += parse-events-bison.o
 perf-y += pmu.o
+perf-y += pmus.o
 perf-y += pmu-flex.o
 perf-y += pmu-bison.o
 perf-y += pmu-hybrid.o
-perf-y += trace-event-read.o
-perf-y += trace-event-info.o
-perf-y += trace-event-scripting.o
-perf-y += trace-event.o
 perf-y += svghelper.o
+perf-$(CONFIG_LIBTRACEEVENT) += trace-event-info.o
+perf-$(CONFIG_LIBTRACEEVENT) += trace-event-scripting.o
+perf-$(CONFIG_LIBTRACEEVENT) += trace-event.o
+perf-$(CONFIG_LIBTRACEEVENT) += trace-event-parse.o
+perf-$(CONFIG_LIBTRACEEVENT) += trace-event-read.o
 perf-y += sort.o
 perf-y += hist.o
 perf-y += util.o
@@ -126,6 +126,7 @@ ifdef CONFIG_LIBOPENCSD
 perf-$(CONFIG_AUXTRACE) += cs-etm.o
 perf-$(CONFIG_AUXTRACE) += cs-etm-decoder/
 endif
+perf-$(CONFIG_AUXTRACE) += cs-etm-base.o
 
 perf-y += parse-branch-options.o
 perf-y += dump-insn.o
@@ -153,8 +154,12 @@ perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
 perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter_cgroup.o
 perf-$(CONFIG_PERF_BPF_SKEL) += bpf_ftrace.o
 perf-$(CONFIG_PERF_BPF_SKEL) += bpf_off_cpu.o
-perf-$(CONFIG_PERF_BPF_SKEL) += bpf_kwork.o
 perf-$(CONFIG_PERF_BPF_SKEL) += bpf_lock_contention.o
+
+ifeq ($(CONFIG_LIBTRACEEVENT),y)
+  perf-$(CONFIG_PERF_BPF_SKEL) += bpf_kwork.o
+endif
+
 perf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o
 perf-$(CONFIG_LIBELF) += symbol-elf.o
 perf-$(CONFIG_LIBELF) += probe-file.o
@@ -189,7 +194,10 @@ perf-$(CONFIG_LIBUNWIND)          += unwind-libunwind.o
 perf-$(CONFIG_LIBUNWIND_X86)      += libunwind/x86_32.o
 perf-$(CONFIG_LIBUNWIND_AARCH64)  += libunwind/arm64.o
 
-perf-$(CONFIG_LIBBABELTRACE) += data-convert-bt.o
+ifeq ($(CONFIG_LIBTRACEEVENT),y)
+  perf-$(CONFIG_LIBBABELTRACE) += data-convert-bt.o
+endif
+
 perf-y += data-convert-json.o
 
 perf-y += scripting-engines/
@@ -220,7 +228,7 @@ perf-$(CONFIG_CXX) += c++/
 perf-$(CONFIG_LIBPFM4) += pfm.o
 
 CFLAGS_config.o   += -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))"
-CFLAGS_llvm-utils.o += -DPERF_INCLUDE_DIR="BUILD_STR($(perf_include_dir_SQ))"
+CFLAGS_llvm-utils.o += -DLIBBPF_INCLUDE_DIR="BUILD_STR($(libbpf_include_dir_SQ))"
 
 # avoid compiler warnings in 32-bit mode
 CFLAGS_genelf_debug.o  += -Wno-packed
@@ -294,10 +302,6 @@ CFLAGS_expr.o          += -Wno-redundant-decls
 CFLAGS_header.o        += -include $(OUTPUT)PERF-VERSION-FILE
 CFLAGS_arm-spe.o       += -I$(srctree)/tools/arch/arm64/include/
 
-$(OUTPUT)util/kallsyms.o: ../lib/symbol/kallsyms.c FORCE
-	$(call rule_mkdir)
-	$(call if_changed_dep,cc_o_c)
-
 $(OUTPUT)util/argv_split.o: ../lib/argv_split.c FORCE
 	$(call rule_mkdir)
 	$(call if_changed_dep,cc_o_c)
diff --git a/tools/perf/util/affinity.c b/tools/perf/util/affinity.c
index 4ee96b3c755b..38dc4524b7e8 100644
--- a/tools/perf/util/affinity.c
+++ b/tools/perf/util/affinity.c
@@ -58,14 +58,14 @@ void affinity__set(struct affinity *a, int cpu)
 		return;
 
 	a->changed = true;
-	set_bit(cpu, a->sched_cpus);
+	__set_bit(cpu, a->sched_cpus);
 	/*
 	 * We ignore errors because affinity is just an optimization.
 	 * This could happen for example with isolated CPUs or cpusets.
 	 * In this case the IPIs inside the kernel's perf API still work.
 	 */
 	sched_setaffinity(0, cpu_set_size, (cpu_set_t *)a->sched_cpus);
-	clear_bit(cpu, a->sched_cpus);
+	__clear_bit(cpu, a->sched_cpus);
 }
 
 static void __affinity__cleanup(struct affinity *a)
diff --git a/tools/perf/util/amd-sample-raw.c b/tools/perf/util/amd-sample-raw.c
index 238305868644..b0e70ce9d87a 100644
--- a/tools/perf/util/amd-sample-raw.c
+++ b/tools/perf/util/amd-sample-raw.c
@@ -16,6 +16,7 @@
 #include "evlist.h"
 #include "sample-raw.h"
 #include "pmu-events/pmu-events.h"
+#include "util/sample.h"
 
 static u32 cpu_family, cpu_model, ibs_fetch_type, ibs_op_type;
 static bool zen4_ibs_extensions;
diff --git a/tools/perf/util/arm64-frame-pointer-unwind-support.h b/tools/perf/util/arm64-frame-pointer-unwind-support.h
index 32af9ce94398..42d3a45490f5 100644
--- a/tools/perf/util/arm64-frame-pointer-unwind-support.h
+++ b/tools/perf/util/arm64-frame-pointer-unwind-support.h
@@ -2,8 +2,10 @@
 #ifndef __PERF_ARM_FRAME_POINTER_UNWIND_SUPPORT_H
 #define __PERF_ARM_FRAME_POINTER_UNWIND_SUPPORT_H
 
-#include "event.h"
-#include "thread.h"
+#include <linux/types.h>
+
+struct perf_sample;
+struct thread;
 
 u64 get_leaf_frame_caller_aarch64(struct perf_sample *sample, struct thread *thread, int user_idx);
 
diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c
index 46ada5ec3f9a..265d20cc126b 100644
--- a/tools/perf/util/auxtrace.c
+++ b/tools/perf/util/auxtrace.c
@@ -59,6 +59,7 @@
 #include <linux/ctype.h>
 #include "symbol/kallsyms.h"
 #include <internal/lib.h>
+#include "util/sample.h"
 
 /*
  * Make a group from 'leader' to 'last', requiring that the events were not
diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h
index 6a0f9b98f059..2cf63d377831 100644
--- a/tools/perf/util/auxtrace.h
+++ b/tools/perf/util/auxtrace.h
@@ -15,7 +15,7 @@
 #include <linux/list.h>
 #include <linux/perf_event.h>
 #include <linux/types.h>
-#include <internal/cpumap.h>
+#include <perf/cpumap.h>
 #include <asm/bitsperlong.h>
 #include <asm/barrier.h>
 
diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index f4adeccdbbcb..6e9b06cf06ee 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -27,11 +27,7 @@
 #include "util.h"
 #include "llvm-utils.h"
 #include "c++/clang-c.h"
-#ifdef HAVE_LIBBPF_SUPPORT
-#include <bpf/hashmap.h>
-#else
 #include "util/hashmap.h"
-#endif
 #include "asm/bug.h"
 
 #include <internal/xyarray.h>
@@ -318,7 +314,7 @@ static void bpf_program_hash_free(void)
 		return;
 
 	hashmap__for_each_entry(bpf_program_hash, cur, bkt)
-		clear_prog_priv(cur->key, cur->value);
+		clear_prog_priv(cur->pkey, cur->pvalue);
 
 	hashmap__free(bpf_program_hash);
 	bpf_program_hash = NULL;
@@ -339,13 +335,12 @@ void bpf__clear(void)
 	bpf_map_hash_free();
 }
 
-static size_t ptr_hash(const void *__key, void *ctx __maybe_unused)
+static size_t ptr_hash(const long __key, void *ctx __maybe_unused)
 {
-	return (size_t) __key;
+	return __key;
 }
 
-static bool ptr_equal(const void *key1, const void *key2,
-			  void *ctx __maybe_unused)
+static bool ptr_equal(long key1, long key2, void *ctx __maybe_unused)
 {
 	return key1 == key2;
 }
@@ -1185,7 +1180,7 @@ static void bpf_map_hash_free(void)
 		return;
 
 	hashmap__for_each_entry(bpf_map_hash, cur, bkt)
-		bpf_map_priv__clear(cur->key, cur->value);
+		bpf_map_priv__clear(cur->pkey, cur->pvalue);
 
 	hashmap__free(bpf_map_hash);
 	bpf_map_hash = NULL;
diff --git a/tools/perf/util/bpf-prologue.h b/tools/perf/util/bpf-prologue.h
index c50c7358009f..66dcf751ef65 100644
--- a/tools/perf/util/bpf-prologue.h
+++ b/tools/perf/util/bpf-prologue.h
@@ -6,9 +6,8 @@
 #ifndef __BPF_PROLOGUE_H
 #define __BPF_PROLOGUE_H
 
-#include <linux/compiler.h>
-#include <linux/filter.h>
-#include "probe-event.h"
+struct probe_trace_arg;
+struct bpf_insn;
 
 #define BPF_PROLOGUE_MAX_ARGS 3
 #define BPF_PROLOGUE_START_ARG_REG BPF_REG_3
@@ -19,6 +18,7 @@ int bpf__gen_prologue(struct probe_trace_arg *args, int nargs,
 		      struct bpf_insn *new_prog, size_t *new_cnt,
 		      size_t cnt_space);
 #else
+#include <linux/compiler.h>
 #include <errno.h>
 
 static inline int
diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
index ef1c15e4aeba..eeee899fcf34 100644
--- a/tools/perf/util/bpf_counter.c
+++ b/tools/perf/util/bpf_counter.c
@@ -561,7 +561,7 @@ static int bperf__load(struct evsel *evsel, struct target *target)
 
 		if (filter_type == BPERF_FILTER_PID ||
 		    filter_type == BPERF_FILTER_TGID)
-			key = evsel->core.threads->map[i].pid;
+			key = perf_thread_map__pid(evsel->core.threads, i);
 		else if (filter_type == BPERF_FILTER_CPU)
 			key = evsel->core.cpus->map[i].cpu;
 		else
diff --git a/tools/perf/util/bpf_kwork.c b/tools/perf/util/bpf_kwork.c
index b629dd679d3f..6eb2c78fd7f4 100644
--- a/tools/perf/util/bpf_kwork.c
+++ b/tools/perf/util/bpf_kwork.c
@@ -7,15 +7,18 @@
 
 #include <time.h>
 #include <fcntl.h>
+#include <signal.h>
 #include <stdio.h>
 #include <unistd.h>
 
 #include <linux/time64.h>
 
 #include "util/debug.h"
+#include "util/evsel.h"
 #include "util/kwork.h"
 
 #include <bpf/bpf.h>
+#include <perf/cpumap.h>
 
 #include "util/bpf_skel/kwork_trace.skel.h"
 
diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
index fc4d613cb979..8e1b791dc58f 100644
--- a/tools/perf/util/bpf_lock_contention.c
+++ b/tools/perf/util/bpf_lock_contention.c
@@ -5,6 +5,7 @@
 #include "util/map.h"
 #include "util/symbol.h"
 #include "util/target.h"
+#include "util/thread.h"
 #include "util/thread_map.h"
 #include "util/lock-contention.h"
 #include <linux/zalloc.h>
@@ -12,17 +13,10 @@
 #include <bpf/bpf.h>
 
 #include "bpf_skel/lock_contention.skel.h"
+#include "bpf_skel/lock_data.h"
 
 static struct lock_contention_bpf *skel;
 
-struct lock_contention_data {
-	u64 total_time;
-	u64 min_time;
-	u64 max_time;
-	u32 count;
-	u32 flags;
-};
-
 int lock_contention_prepare(struct lock_contention *con)
 {
 	int i, fd;
@@ -37,8 +31,16 @@ int lock_contention_prepare(struct lock_contention *con)
 	}
 
 	bpf_map__set_value_size(skel->maps.stacks, con->max_stack * sizeof(u64));
-	bpf_map__set_max_entries(skel->maps.stacks, con->map_nr_entries);
 	bpf_map__set_max_entries(skel->maps.lock_stat, con->map_nr_entries);
+	bpf_map__set_max_entries(skel->maps.tstamp, con->map_nr_entries);
+
+	if (con->aggr_mode == LOCK_AGGR_TASK) {
+		bpf_map__set_max_entries(skel->maps.task_data, con->map_nr_entries);
+		bpf_map__set_max_entries(skel->maps.stacks, 1);
+	} else {
+		bpf_map__set_max_entries(skel->maps.task_data, 1);
+		bpf_map__set_max_entries(skel->maps.stacks, con->map_nr_entries);
+	}
 
 	if (target__has_cpu(target))
 		ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus);
@@ -88,7 +90,9 @@ int lock_contention_prepare(struct lock_contention *con)
 		bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
 	}
 
+	/* these don't work well if in the rodata section */
 	skel->bss->stack_skip = con->stack_skip;
+	skel->bss->aggr_mode = con->aggr_mode;
 
 	lock_contention_bpf__attach(skel);
 	return 0;
@@ -108,28 +112,48 @@ int lock_contention_stop(void)
 
 int lock_contention_read(struct lock_contention *con)
 {
-	int fd, stack;
-	s32 prev_key, key;
-	struct lock_contention_data data;
-	struct lock_stat *st;
+	int fd, stack, task_fd, err = 0;
+	struct contention_key *prev_key, key;
+	struct contention_data data = {};
+	struct lock_stat *st = NULL;
 	struct machine *machine = con->machine;
-	u64 stack_trace[con->max_stack];
+	u64 *stack_trace;
+	size_t stack_size = con->max_stack * sizeof(*stack_trace);
 
 	fd = bpf_map__fd(skel->maps.lock_stat);
 	stack = bpf_map__fd(skel->maps.stacks);
+	task_fd = bpf_map__fd(skel->maps.task_data);
 
 	con->lost = skel->bss->lost;
 
-	prev_key = 0;
-	while (!bpf_map_get_next_key(fd, &prev_key, &key)) {
+	stack_trace = zalloc(stack_size);
+	if (stack_trace == NULL)
+		return -1;
+
+	if (con->aggr_mode == LOCK_AGGR_TASK) {
+		struct thread *idle = __machine__findnew_thread(machine,
+								/*pid=*/0,
+								/*tid=*/0);
+		thread__set_comm(idle, "swapper", /*timestamp=*/0);
+	}
+
+	/* make sure it loads the kernel map */
+	map__load(maps__first(machine->kmaps));
+
+	prev_key = NULL;
+	while (!bpf_map_get_next_key(fd, prev_key, &key)) {
 		struct map *kmap;
 		struct symbol *sym;
 		int idx = 0;
+		s32 stack_id;
+
+		/* to handle errors in the loop body */
+		err = -1;
 
 		bpf_map_lookup_elem(fd, &key, &data);
 		st = zalloc(sizeof(*st));
 		if (st == NULL)
-			return -1;
+			break;
 
 		st->nr_contended = data.count;
 		st->wait_time_total = data.total_time;
@@ -140,11 +164,34 @@ int lock_contention_read(struct lock_contention *con)
 			st->avg_wait_time = data.total_time / data.count;
 
 		st->flags = data.flags;
+		st->addr = key.aggr_key;
+
+		if (con->aggr_mode == LOCK_AGGR_TASK) {
+			struct contention_task_data task;
+			struct thread *t;
+			int pid = key.aggr_key;
+
+			/* do not update idle comm which contains CPU number */
+			if (st->addr) {
+				bpf_map_lookup_elem(task_fd, &pid, &task);
+				t = __machine__findnew_thread(machine, /*pid=*/-1, pid);
+				thread__set_comm(t, task.comm, /*timestamp=*/0);
+			}
+			goto next;
+		}
 
-		bpf_map_lookup_elem(stack, &key, stack_trace);
+		if (con->aggr_mode == LOCK_AGGR_ADDR) {
+			sym = machine__find_kernel_symbol(machine, st->addr, &kmap);
+			if (sym)
+				st->name = strdup(sym->name);
+			goto next;
+		}
+
+		stack_id = key.aggr_key;
+		bpf_map_lookup_elem(stack, &stack_id, stack_trace);
 
 		/* skip lock internal functions */
-		while (is_lock_function(machine, stack_trace[idx]) &&
+		while (machine__is_lock_function(machine, stack_trace[idx]) &&
 		       idx < con->max_stack - 1)
 			idx++;
 
@@ -163,25 +210,32 @@ int lock_contention_read(struct lock_contention *con)
 				st->name = strdup(sym->name);
 
 			if (ret < 0 || st->name == NULL)
-				return -1;
+				break;
 		} else if (asprintf(&st->name, "%#lx", (unsigned long)st->addr) < 0) {
-			free(st);
-			return -1;
+			break;
 		}
 
 		if (verbose) {
-			st->callstack = memdup(stack_trace, sizeof(stack_trace));
-			if (st->callstack == NULL) {
-				free(st);
-				return -1;
-			}
+			st->callstack = memdup(stack_trace, stack_size);
+			if (st->callstack == NULL)
+				break;
 		}
-
+next:
 		hlist_add_head(&st->hash_entry, con->result);
-		prev_key = key;
+		prev_key = &key;
+
+		/* we're fine now, reset the values */
+		st = NULL;
+		err = 0;
 	}
 
-	return 0;
+	free(stack_trace);
+	if (st) {
+		free(st->name);
+		free(st);
+	}
+
+	return err;
 }
 
 int lock_contention_finish(void)
diff --git a/tools/perf/util/bpf_map.h b/tools/perf/util/bpf_map.h
index d6abd5e47af8..c2f7c13cba23 100644
--- a/tools/perf/util/bpf_map.h
+++ b/tools/perf/util/bpf_map.h
@@ -3,7 +3,6 @@
 #define __PERF_BPF_MAP_H 1
 
 #include <stdio.h>
-#include <linux/compiler.h>
 struct bpf_map;
 
 #ifdef HAVE_LIBBPF_SUPPORT
@@ -12,6 +11,8 @@ int bpf_map__fprintf(struct bpf_map *map, FILE *fp);
 
 #else
 
+#include <linux/compiler.h>
+
 static inline int bpf_map__fprintf(struct bpf_map *map __maybe_unused, FILE *fp __maybe_unused)
 {
 	return 0;
diff --git a/tools/perf/util/bpf_off_cpu.c b/tools/perf/util/bpf_off_cpu.c
index c257813e674e..01f70b8e705a 100644
--- a/tools/perf/util/bpf_off_cpu.c
+++ b/tools/perf/util/bpf_off_cpu.c
@@ -102,7 +102,7 @@ static void check_sched_switch_args(void)
 	const struct btf_type *t1, *t2, *t3;
 	u32 type_id;
 
-	type_id = btf__find_by_name_kind(btf, "bpf_trace_sched_switch",
+	type_id = btf__find_by_name_kind(btf, "btf_trace_sched_switch",
 					 BTF_KIND_TYPEDEF);
 	if ((s32)type_id < 0)
 		return;
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
index 1bb8628e7c9f..11b0fc7ee53b 100644
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -5,24 +5,11 @@
 #include <bpf/bpf_tracing.h>
 #include <bpf/bpf_core_read.h>
 
-/* maximum stack trace depth */
-#define MAX_STACKS   8
+#include "lock_data.h"
 
 /* default buffer size */
 #define MAX_ENTRIES  10240
 
-struct contention_key {
-	__s32 stack_id;
-};
-
-struct contention_data {
-	__u64 total_time;
-	__u64 min_time;
-	__u64 max_time;
-	__u32 count;
-	__u32 flags;
-};
-
 struct tstamp_data {
 	__u64 timestamp;
 	__u64 lock;
@@ -34,16 +21,16 @@ struct tstamp_data {
 struct {
 	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
 	__uint(key_size, sizeof(__u32));
-	__uint(value_size, MAX_STACKS * sizeof(__u64));
+	__uint(value_size, sizeof(__u64));
 	__uint(max_entries, MAX_ENTRIES);
 } stacks SEC(".maps");
 
 /* maintain timestamp at the beginning of contention */
 struct {
-	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
-	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__uint(type, BPF_MAP_TYPE_HASH);
 	__type(key, int);
 	__type(value, struct tstamp_data);
+	__uint(max_entries, MAX_ENTRIES);
 } tstamp SEC(".maps");
 
 /* actual lock contention statistics */
@@ -57,6 +44,13 @@ struct {
 struct {
 	__uint(type, BPF_MAP_TYPE_HASH);
 	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(struct contention_task_data));
+	__uint(max_entries, MAX_ENTRIES);
+} task_data SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u32));
 	__uint(value_size, sizeof(__u8));
 	__uint(max_entries, 1);
 } cpu_filter SEC(".maps");
@@ -74,6 +68,9 @@ int has_cpu;
 int has_task;
 int stack_skip;
 
+/* determine the key of lock stat */
+int aggr_mode;
+
 /* error stat */
 int lost;
 
@@ -100,35 +97,62 @@ static inline int can_record(void)
 	return 1;
 }
 
+static inline void update_task_data(__u32 pid)
+{
+	struct contention_task_data *p;
+
+	p = bpf_map_lookup_elem(&task_data, &pid);
+	if (p == NULL) {
+		struct contention_task_data data;
+
+		bpf_get_current_comm(data.comm, sizeof(data.comm));
+		bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST);
+	}
+}
+
 SEC("tp_btf/contention_begin")
 int contention_begin(u64 *ctx)
 {
-	struct task_struct *curr;
+	__u32 pid;
 	struct tstamp_data *pelem;
 
 	if (!enabled || !can_record())
 		return 0;
 
-	curr = bpf_get_current_task_btf();
-	pelem = bpf_task_storage_get(&tstamp, curr, NULL,
-				     BPF_LOCAL_STORAGE_GET_F_CREATE);
-	if (!pelem || pelem->lock)
+	pid = bpf_get_current_pid_tgid();
+	pelem = bpf_map_lookup_elem(&tstamp, &pid);
+	if (pelem && pelem->lock)
 		return 0;
 
+	if (pelem == NULL) {
+		struct tstamp_data zero = {};
+
+		bpf_map_update_elem(&tstamp, &pid, &zero, BPF_ANY);
+		pelem = bpf_map_lookup_elem(&tstamp, &pid);
+		if (pelem == NULL) {
+			lost++;
+			return 0;
+		}
+	}
+
 	pelem->timestamp = bpf_ktime_get_ns();
 	pelem->lock = (__u64)ctx[0];
 	pelem->flags = (__u32)ctx[1];
-	pelem->stack_id = bpf_get_stackid(ctx, &stacks, BPF_F_FAST_STACK_CMP | stack_skip);
 
-	if (pelem->stack_id < 0)
-		lost++;
+	if (aggr_mode == LOCK_AGGR_CALLER) {
+		pelem->stack_id = bpf_get_stackid(ctx, &stacks,
+						  BPF_F_FAST_STACK_CMP | stack_skip);
+		if (pelem->stack_id < 0)
+			lost++;
+	}
+
 	return 0;
 }
 
 SEC("tp_btf/contention_end")
 int contention_end(u64 *ctx)
 {
-	struct task_struct *curr;
+	__u32 pid;
 	struct tstamp_data *pelem;
 	struct contention_key key;
 	struct contention_data *data;
@@ -137,14 +161,29 @@ int contention_end(u64 *ctx)
 	if (!enabled)
 		return 0;
 
-	curr = bpf_get_current_task_btf();
-	pelem = bpf_task_storage_get(&tstamp, curr, NULL, 0);
+	pid = bpf_get_current_pid_tgid();
+	pelem = bpf_map_lookup_elem(&tstamp, &pid);
 	if (!pelem || pelem->lock != ctx[0])
 		return 0;
 
 	duration = bpf_ktime_get_ns() - pelem->timestamp;
 
-	key.stack_id = pelem->stack_id;
+	switch (aggr_mode) {
+	case LOCK_AGGR_CALLER:
+		key.aggr_key = pelem->stack_id;
+		break;
+	case LOCK_AGGR_TASK:
+		key.aggr_key = pid;
+		update_task_data(pid);
+		break;
+	case LOCK_AGGR_ADDR:
+		key.aggr_key = pelem->lock;
+		break;
+	default:
+		/* should not happen */
+		return 0;
+	}
+
 	data = bpf_map_lookup_elem(&lock_stat, &key);
 	if (!data) {
 		struct contention_data first = {
@@ -156,7 +195,7 @@ int contention_end(u64 *ctx)
 		};
 
 		bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
-		pelem->lock = 0;
+		bpf_map_delete_elem(&tstamp, &pid);
 		return 0;
 	}
 
@@ -169,7 +208,7 @@ int contention_end(u64 *ctx)
 	if (data->min_time > duration)
 		data->min_time = duration;
 
-	pelem->lock = 0;
+	bpf_map_delete_elem(&tstamp, &pid);
 	return 0;
 }
 
diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
new file mode 100644
index 000000000000..ce71cf1a7e1e
--- /dev/null
+++ b/tools/perf/util/bpf_skel/lock_data.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Data structures shared between BPF and tools. */
+#ifndef UTIL_BPF_SKEL_LOCK_DATA_H
+#define UTIL_BPF_SKEL_LOCK_DATA_H
+
+struct contention_key {
+	u64 aggr_key;  /* can be stack_id, pid or lock addr */
+};
+
+#define TASK_COMM_LEN  16
+
+struct contention_task_data {
+	char comm[TASK_COMM_LEN];
+};
+
+struct contention_data {
+	u64 total_time;
+	u64 min_time;
+	u64 max_time;
+	u32 count;
+	u32 flags;
+};
+
+enum lock_aggr_mode {
+	LOCK_AGGR_ADDR = 0,
+	LOCK_AGGR_TASK,
+	LOCK_AGGR_CALLER,
+};
+
+#endif /* UTIL_BPF_SKEL_LOCK_DATA_H */
diff --git a/tools/perf/util/branch.h b/tools/perf/util/branch.h
index f838b23db180..3ed792db1125 100644
--- a/tools/perf/util/branch.h
+++ b/tools/perf/util/branch.h
@@ -7,12 +7,10 @@
  * detected in at least musl libc, used in Alpine Linux. -acme
  */
 #include <stdio.h>
-#include <stdint.h>
-#include <linux/compiler.h>
-#include <linux/stddef.h>
 #include <linux/perf_event.h>
 #include <linux/types.h>
-#include "event.h"
+#include "util/map_symbol.h"
+#include "util/sample.h"
 
 struct branch_flags {
 	union {
@@ -24,9 +22,10 @@ struct branch_flags {
 			u64 abort:1;
 			u64 cycles:16;
 			u64 type:4;
+			u64 spec:2;
 			u64 new_type:4;
 			u64 priv:3;
-			u64 reserved:33;
+			u64 reserved:31;
 		};
 	};
 };
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index 3f2ae19a1dd4..658170b8dcef 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -556,7 +556,7 @@ static char *home_perfconfig(void)
 
 	config = strdup(mkpath("%s/.perfconfig", home));
 	if (config == NULL) {
-		pr_warning("Not enough memory to process %s/.perfconfig, ignoring it.", home);
+		pr_warning("Not enough memory to process %s/.perfconfig, ignoring it.\n", home);
 		return NULL;
 	}
 
@@ -564,7 +564,7 @@ static char *home_perfconfig(void)
 		goto out_free;
 
 	if (st.st_uid && (st.st_uid != geteuid())) {
-		pr_warning("File %s not owned by current user or root, ignoring it.", config);
+		pr_warning("File %s not owned by current user or root, ignoring it.\n", config);
 		goto out_free;
 	}
 
diff --git a/tools/perf/util/counts.c b/tools/perf/util/counts.c
index 7a447d918458..11cd85b278a6 100644
--- a/tools/perf/util/counts.c
+++ b/tools/perf/util/counts.c
@@ -48,7 +48,6 @@ void perf_counts__reset(struct perf_counts *counts)
 {
 	xyarray__reset(counts->loaded);
 	xyarray__reset(counts->values);
-	memset(&counts->aggr, 0, sizeof(struct perf_counts_values));
 }
 
 void evsel__reset_counts(struct evsel *evsel)
diff --git a/tools/perf/util/counts.h b/tools/perf/util/counts.h
index 5de275194f2b..42760242e0df 100644
--- a/tools/perf/util/counts.h
+++ b/tools/perf/util/counts.h
@@ -11,7 +11,6 @@ struct evsel;
 
 struct perf_counts {
 	s8			  scaled;
-	struct perf_counts_values aggr;
 	struct xyarray		  *values;
 	struct xyarray		  *loaded;
 };
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 8486ca3bec75..5e564974fba4 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -12,6 +12,7 @@
 
 #include <linux/ctype.h>
 #include <linux/zalloc.h>
+#include <internal/cpumap.h>
 
 static struct perf_cpu max_cpu_num;
 static struct perf_cpu max_present_cpu_num;
@@ -234,7 +235,7 @@ static int aggr_cpu_id__cmp(const void *a_pointer, const void *b_pointer)
 
 struct cpu_aggr_map *cpu_aggr_map__new(const struct perf_cpu_map *cpus,
 				       aggr_cpu_id_get_t get_id,
-				       void *data)
+				       void *data, bool needs_sort)
 {
 	int idx;
 	struct perf_cpu cpu;
@@ -270,8 +271,10 @@ struct cpu_aggr_map *cpu_aggr_map__new(const struct perf_cpu_map *cpus,
 		if (trimmed_c)
 			c = trimmed_c;
 	}
+
 	/* ensure we process id in increasing order */
-	qsort(c->map, c->nr, sizeof(struct aggr_cpu_id), aggr_cpu_id__cmp);
+	if (needs_sort)
+		qsort(c->map, c->nr, sizeof(struct aggr_cpu_id), aggr_cpu_id__cmp);
 
 	return c;
 
@@ -354,6 +357,16 @@ struct aggr_cpu_id aggr_cpu_id__node(struct perf_cpu cpu, void *data __maybe_unu
 	return id;
 }
 
+struct aggr_cpu_id aggr_cpu_id__global(struct perf_cpu cpu, void *data __maybe_unused)
+{
+	struct aggr_cpu_id id = aggr_cpu_id__empty();
+
+	/* it always aggregates to the cpu 0 */
+	cpu.cpu = 0;
+	id.cpu = cpu;
+	return id;
+}
+
 /* setup simple routines to easily access node numbers given a cpu number */
 static int get_max_num(char *path, int *max)
 {
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 4a6d029576ee..c2f5824a3a22 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -4,8 +4,8 @@
 
 #include <stdbool.h>
 #include <stdio.h>
-#include <internal/cpumap.h>
 #include <perf/cpumap.h>
+#include <linux/refcount.h>
 
 /** Identify where counts are aggregated, -1 implies not to aggregate. */
 struct aggr_cpu_id {
@@ -97,7 +97,7 @@ typedef struct aggr_cpu_id (*aggr_cpu_id_get_t)(struct perf_cpu cpu, void *data)
  */
 struct cpu_aggr_map *cpu_aggr_map__new(const struct perf_cpu_map *cpus,
 				       aggr_cpu_id_get_t get_id,
-				       void *data);
+				       void *data, bool needs_sort);
 
 bool aggr_cpu_id__equal(const struct aggr_cpu_id *a, const struct aggr_cpu_id *b);
 bool aggr_cpu_id__is_empty(const struct aggr_cpu_id *a);
@@ -133,5 +133,9 @@ struct aggr_cpu_id aggr_cpu_id__cpu(struct perf_cpu cpu, void *data);
  * cpu. The function signature is compatible with aggr_cpu_id_get_t.
  */
 struct aggr_cpu_id aggr_cpu_id__node(struct perf_cpu cpu, void *data);
-
+/**
+ * aggr_cpu_id__global - Create an aggr_cpu_id for global aggregation.
+ * The function signature is compatible with aggr_cpu_id_get_t.
+ */
+struct aggr_cpu_id aggr_cpu_id__global(struct perf_cpu cpu, void *data);
 #endif /* __PERF_CPUMAP_H */
diff --git a/tools/perf/util/cs-etm-base.c b/tools/perf/util/cs-etm-base.c
new file mode 100644
index 000000000000..597542410854
--- /dev/null
+++ b/tools/perf/util/cs-etm-base.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * File for any parts of the Coresight decoding that don't require
+ * OpenCSD.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+
+#include "cs-etm.h"
+
+static const char * const cs_etm_global_header_fmts[] = {
+	[CS_HEADER_VERSION]	= "	Header version		       %llx\n",
+	[CS_PMU_TYPE_CPUS]	= "	PMU type/num cpus	       %llx\n",
+	[CS_ETM_SNAPSHOT]	= "	Snapshot		       %llx\n",
+};
+
+static const char * const cs_etm_priv_fmts[] = {
+	[CS_ETM_MAGIC]		= "	Magic number		       %llx\n",
+	[CS_ETM_CPU]		= "	CPU			       %lld\n",
+	[CS_ETM_NR_TRC_PARAMS]	= "	NR_TRC_PARAMS		       %llx\n",
+	[CS_ETM_ETMCR]		= "	ETMCR			       %llx\n",
+	[CS_ETM_ETMTRACEIDR]	= "	ETMTRACEIDR		       %llx\n",
+	[CS_ETM_ETMCCER]	= "	ETMCCER			       %llx\n",
+	[CS_ETM_ETMIDR]		= "	ETMIDR			       %llx\n",
+};
+
+static const char * const cs_etmv4_priv_fmts[] = {
+	[CS_ETM_MAGIC]		= "	Magic number		       %llx\n",
+	[CS_ETM_CPU]		= "	CPU			       %lld\n",
+	[CS_ETM_NR_TRC_PARAMS]	= "	NR_TRC_PARAMS		       %llx\n",
+	[CS_ETMV4_TRCCONFIGR]	= "	TRCCONFIGR		       %llx\n",
+	[CS_ETMV4_TRCTRACEIDR]	= "	TRCTRACEIDR		       %llx\n",
+	[CS_ETMV4_TRCIDR0]	= "	TRCIDR0			       %llx\n",
+	[CS_ETMV4_TRCIDR1]	= "	TRCIDR1			       %llx\n",
+	[CS_ETMV4_TRCIDR2]	= "	TRCIDR2			       %llx\n",
+	[CS_ETMV4_TRCIDR8]	= "	TRCIDR8			       %llx\n",
+	[CS_ETMV4_TRCAUTHSTATUS] = "	TRCAUTHSTATUS		       %llx\n",
+	[CS_ETE_TRCDEVARCH]	= "	TRCDEVARCH                     %llx\n"
+};
+
+static const char * const param_unk_fmt =
+	"	Unknown parameter [%d]	       %"PRIx64"\n";
+static const char * const magic_unk_fmt =
+	"	Magic number Unknown	       %"PRIx64"\n";
+
+static int cs_etm__print_cpu_metadata_v0(u64 *val, int *offset)
+{
+	int i = *offset, j, nr_params = 0, fmt_offset;
+	u64 magic;
+
+	/* check magic value */
+	magic = val[i + CS_ETM_MAGIC];
+	if ((magic != __perf_cs_etmv3_magic) &&
+	    (magic != __perf_cs_etmv4_magic)) {
+		/* failure - note bad magic value */
+		fprintf(stdout, magic_unk_fmt, magic);
+		return -EINVAL;
+	}
+
+	/* print common header block */
+	fprintf(stdout, cs_etm_priv_fmts[CS_ETM_MAGIC], val[i++]);
+	fprintf(stdout, cs_etm_priv_fmts[CS_ETM_CPU], val[i++]);
+
+	if (magic == __perf_cs_etmv3_magic) {
+		nr_params = CS_ETM_NR_TRC_PARAMS_V0;
+		fmt_offset = CS_ETM_ETMCR;
+		/* after common block, offset format index past NR_PARAMS */
+		for (j = fmt_offset; j < nr_params + fmt_offset; j++, i++)
+			fprintf(stdout, cs_etm_priv_fmts[j], val[i]);
+	} else if (magic == __perf_cs_etmv4_magic) {
+		nr_params = CS_ETMV4_NR_TRC_PARAMS_V0;
+		fmt_offset = CS_ETMV4_TRCCONFIGR;
+		/* after common block, offset format index past NR_PARAMS */
+		for (j = fmt_offset; j < nr_params + fmt_offset; j++, i++)
+			fprintf(stdout, cs_etmv4_priv_fmts[j], val[i]);
+	}
+	*offset = i;
+	return 0;
+}
+
+static int cs_etm__print_cpu_metadata_v1(u64 *val, int *offset)
+{
+	int i = *offset, j, total_params = 0;
+	u64 magic;
+
+	magic = val[i + CS_ETM_MAGIC];
+	/* total params to print is NR_PARAMS + common block size for v1 */
+	total_params = val[i + CS_ETM_NR_TRC_PARAMS] + CS_ETM_COMMON_BLK_MAX_V1;
+
+	if (magic == __perf_cs_etmv3_magic) {
+		for (j = 0; j < total_params; j++, i++) {
+			/* if newer record - could be excess params */
+			if (j >= CS_ETM_PRIV_MAX)
+				fprintf(stdout, param_unk_fmt, j, val[i]);
+			else
+				fprintf(stdout, cs_etm_priv_fmts[j], val[i]);
+		}
+	} else if (magic == __perf_cs_etmv4_magic || magic == __perf_cs_ete_magic) {
+		/*
+		 * ETE and ETMv4 can be printed in the same block because the number of parameters
+		 * is saved and they share the list of parameter names. ETE is also only supported
+		 * in V1 files.
+		 */
+		for (j = 0; j < total_params; j++, i++) {
+			/* if newer record - could be excess params */
+			if (j >= CS_ETE_PRIV_MAX)
+				fprintf(stdout, param_unk_fmt, j, val[i]);
+			else
+				fprintf(stdout, cs_etmv4_priv_fmts[j], val[i]);
+		}
+	} else {
+		/* failure - note bad magic value and error out */
+		fprintf(stdout, magic_unk_fmt, magic);
+		return -EINVAL;
+	}
+	*offset = i;
+	return 0;
+}
+
+static void cs_etm__print_auxtrace_info(u64 *val, int num)
+{
+	int i, cpu = 0, version, err;
+
+	version = val[0];
+
+	for (i = 0; i < CS_HEADER_VERSION_MAX; i++)
+		fprintf(stdout, cs_etm_global_header_fmts[i], val[i]);
+
+	for (i = CS_HEADER_VERSION_MAX; cpu < num; cpu++) {
+		if (version == 0)
+			err = cs_etm__print_cpu_metadata_v0(val, &i);
+		else if (version == 1)
+			err = cs_etm__print_cpu_metadata_v1(val, &i);
+		if (err)
+			return;
+	}
+}
+
+/*
+ * Do some basic checks and print the auxtrace info header before calling
+ * into cs_etm__process_auxtrace_info_full() which requires OpenCSD to be
+ * linked in. This allows some basic debugging if OpenCSD is missing.
+ */
+int cs_etm__process_auxtrace_info(union perf_event *event,
+				  struct perf_session *session)
+{
+	struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
+	int event_header_size = sizeof(struct perf_event_header);
+	int num_cpu;
+	u64 *ptr = NULL;
+	u64 hdr_version;
+
+	if (auxtrace_info->header.size < (event_header_size + INFO_HEADER_SIZE))
+		return -EINVAL;
+
+	/* First the global part */
+	ptr = (u64 *) auxtrace_info->priv;
+
+	/* Look for version of the header */
+	hdr_version = ptr[0];
+	if (hdr_version > CS_HEADER_CURRENT_VERSION) {
+		pr_err("\nCS ETM Trace: Unknown Header Version = %#" PRIx64, hdr_version);
+		pr_err(", version supported <= %x\n", CS_HEADER_CURRENT_VERSION);
+		return -EINVAL;
+	}
+
+	if (dump_trace) {
+		num_cpu = ptr[CS_PMU_TYPE_CPUS] & 0xffffffff;
+		cs_etm__print_auxtrace_info(ptr, num_cpu);
+	}
+
+	return cs_etm__process_auxtrace_info_full(event, session);
+}
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 16db965ac995..33303d03c2fa 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -2510,141 +2510,6 @@ static bool cs_etm__is_timeless_decoding(struct cs_etm_auxtrace *etm)
 	return timeless_decoding;
 }
 
-static const char * const cs_etm_global_header_fmts[] = {
-	[CS_HEADER_VERSION]	= "	Header version		       %llx\n",
-	[CS_PMU_TYPE_CPUS]	= "	PMU type/num cpus	       %llx\n",
-	[CS_ETM_SNAPSHOT]	= "	Snapshot		       %llx\n",
-};
-
-static const char * const cs_etm_priv_fmts[] = {
-	[CS_ETM_MAGIC]		= "	Magic number		       %llx\n",
-	[CS_ETM_CPU]		= "	CPU			       %lld\n",
-	[CS_ETM_NR_TRC_PARAMS]	= "	NR_TRC_PARAMS		       %llx\n",
-	[CS_ETM_ETMCR]		= "	ETMCR			       %llx\n",
-	[CS_ETM_ETMTRACEIDR]	= "	ETMTRACEIDR		       %llx\n",
-	[CS_ETM_ETMCCER]	= "	ETMCCER			       %llx\n",
-	[CS_ETM_ETMIDR]		= "	ETMIDR			       %llx\n",
-};
-
-static const char * const cs_etmv4_priv_fmts[] = {
-	[CS_ETM_MAGIC]		= "	Magic number		       %llx\n",
-	[CS_ETM_CPU]		= "	CPU			       %lld\n",
-	[CS_ETM_NR_TRC_PARAMS]	= "	NR_TRC_PARAMS		       %llx\n",
-	[CS_ETMV4_TRCCONFIGR]	= "	TRCCONFIGR		       %llx\n",
-	[CS_ETMV4_TRCTRACEIDR]	= "	TRCTRACEIDR		       %llx\n",
-	[CS_ETMV4_TRCIDR0]	= "	TRCIDR0			       %llx\n",
-	[CS_ETMV4_TRCIDR1]	= "	TRCIDR1			       %llx\n",
-	[CS_ETMV4_TRCIDR2]	= "	TRCIDR2			       %llx\n",
-	[CS_ETMV4_TRCIDR8]	= "	TRCIDR8			       %llx\n",
-	[CS_ETMV4_TRCAUTHSTATUS] = "	TRCAUTHSTATUS		       %llx\n",
-	[CS_ETE_TRCDEVARCH]	= "	TRCDEVARCH                     %llx\n"
-};
-
-static const char * const param_unk_fmt =
-	"	Unknown parameter [%d]	       %llx\n";
-static const char * const magic_unk_fmt =
-	"	Magic number Unknown	       %llx\n";
-
-static int cs_etm__print_cpu_metadata_v0(__u64 *val, int *offset)
-{
-	int i = *offset, j, nr_params = 0, fmt_offset;
-	__u64 magic;
-
-	/* check magic value */
-	magic = val[i + CS_ETM_MAGIC];
-	if ((magic != __perf_cs_etmv3_magic) &&
-	    (magic != __perf_cs_etmv4_magic)) {
-		/* failure - note bad magic value */
-		fprintf(stdout, magic_unk_fmt, magic);
-		return -EINVAL;
-	}
-
-	/* print common header block */
-	fprintf(stdout, cs_etm_priv_fmts[CS_ETM_MAGIC], val[i++]);
-	fprintf(stdout, cs_etm_priv_fmts[CS_ETM_CPU], val[i++]);
-
-	if (magic == __perf_cs_etmv3_magic) {
-		nr_params = CS_ETM_NR_TRC_PARAMS_V0;
-		fmt_offset = CS_ETM_ETMCR;
-		/* after common block, offset format index past NR_PARAMS */
-		for (j = fmt_offset; j < nr_params + fmt_offset; j++, i++)
-			fprintf(stdout, cs_etm_priv_fmts[j], val[i]);
-	} else if (magic == __perf_cs_etmv4_magic) {
-		nr_params = CS_ETMV4_NR_TRC_PARAMS_V0;
-		fmt_offset = CS_ETMV4_TRCCONFIGR;
-		/* after common block, offset format index past NR_PARAMS */
-		for (j = fmt_offset; j < nr_params + fmt_offset; j++, i++)
-			fprintf(stdout, cs_etmv4_priv_fmts[j], val[i]);
-	}
-	*offset = i;
-	return 0;
-}
-
-static int cs_etm__print_cpu_metadata_v1(__u64 *val, int *offset)
-{
-	int i = *offset, j, total_params = 0;
-	__u64 magic;
-
-	magic = val[i + CS_ETM_MAGIC];
-	/* total params to print is NR_PARAMS + common block size for v1 */
-	total_params = val[i + CS_ETM_NR_TRC_PARAMS] + CS_ETM_COMMON_BLK_MAX_V1;
-
-	if (magic == __perf_cs_etmv3_magic) {
-		for (j = 0; j < total_params; j++, i++) {
-			/* if newer record - could be excess params */
-			if (j >= CS_ETM_PRIV_MAX)
-				fprintf(stdout, param_unk_fmt, j, val[i]);
-			else
-				fprintf(stdout, cs_etm_priv_fmts[j], val[i]);
-		}
-	} else if (magic == __perf_cs_etmv4_magic || magic == __perf_cs_ete_magic) {
-		/*
-		 * ETE and ETMv4 can be printed in the same block because the number of parameters
-		 * is saved and they share the list of parameter names. ETE is also only supported
-		 * in V1 files.
-		 */
-		for (j = 0; j < total_params; j++, i++) {
-			/* if newer record - could be excess params */
-			if (j >= CS_ETE_PRIV_MAX)
-				fprintf(stdout, param_unk_fmt, j, val[i]);
-			else
-				fprintf(stdout, cs_etmv4_priv_fmts[j], val[i]);
-		}
-	} else {
-		/* failure - note bad magic value and error out */
-		fprintf(stdout, magic_unk_fmt, magic);
-		return -EINVAL;
-	}
-	*offset = i;
-	return 0;
-}
-
-static void cs_etm__print_auxtrace_info(__u64 *val, int num)
-{
-	int i, cpu = 0, version, err;
-
-	/* bail out early on bad header version */
-	version = val[0];
-	if (version > CS_HEADER_CURRENT_VERSION) {
-		/* failure.. return */
-		fprintf(stdout, "	Unknown Header Version = %x, ", version);
-		fprintf(stdout, "Version supported <= %x\n", CS_HEADER_CURRENT_VERSION);
-		return;
-	}
-
-	for (i = 0; i < CS_HEADER_VERSION_MAX; i++)
-		fprintf(stdout, cs_etm_global_header_fmts[i], val[i]);
-
-	for (i = CS_HEADER_VERSION_MAX; cpu < num; cpu++) {
-		if (version == 0)
-			err = cs_etm__print_cpu_metadata_v0(val, &i);
-		else if (version == 1)
-			err = cs_etm__print_cpu_metadata_v1(val, &i);
-		if (err)
-			return;
-	}
-}
-
 /*
  * Read a single cpu parameter block from the auxtrace_info priv block.
  *
@@ -2881,57 +2746,20 @@ static int cs_etm__queue_aux_records(struct perf_session *session)
 	return 0;
 }
 
-int cs_etm__process_auxtrace_info(union perf_event *event,
-				  struct perf_session *session)
+int cs_etm__process_auxtrace_info_full(union perf_event *event,
+				       struct perf_session *session)
 {
 	struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
 	struct cs_etm_auxtrace *etm = NULL;
 	struct int_node *inode;
-	unsigned int pmu_type;
 	int event_header_size = sizeof(struct perf_event_header);
-	int info_header_size;
 	int total_size = auxtrace_info->header.size;
 	int priv_size = 0;
 	int num_cpu, trcidr_idx;
 	int err = 0;
 	int i, j;
-	u64 *ptr, *hdr = NULL;
+	u64 *ptr = NULL;
 	u64 **metadata = NULL;
-	u64 hdr_version;
-
-	/*
-	 * sizeof(auxtrace_info_event::type) +
-	 * sizeof(auxtrace_info_event::reserved) == 8
-	 */
-	info_header_size = 8;
-
-	if (total_size < (event_header_size + info_header_size))
-		return -EINVAL;
-
-	priv_size = total_size - event_header_size - info_header_size;
-
-	/* First the global part */
-	ptr = (u64 *) auxtrace_info->priv;
-
-	/* Look for version of the header */
-	hdr_version = ptr[0];
-	if (hdr_version > CS_HEADER_CURRENT_VERSION) {
-		/* print routine will print an error on bad version */
-		if (dump_trace)
-			cs_etm__print_auxtrace_info(auxtrace_info->priv, 0);
-		return -EINVAL;
-	}
-
-	hdr = zalloc(sizeof(*hdr) * CS_HEADER_VERSION_MAX);
-	if (!hdr)
-		return -ENOMEM;
-
-	/* Extract header information - see cs-etm.h for format */
-	for (i = 0; i < CS_HEADER_VERSION_MAX; i++)
-		hdr[i] = ptr[i];
-	num_cpu = hdr[CS_PMU_TYPE_CPUS] & 0xffffffff;
-	pmu_type = (unsigned int) ((hdr[CS_PMU_TYPE_CPUS] >> 32) &
-				    0xffffffff);
 
 	/*
 	 * Create an RB tree for traceID-metadata tuple.  Since the conversion
@@ -2939,17 +2767,21 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
 	 * in anything other than a sequential array is worth doing.
 	 */
 	traceid_list = intlist__new(NULL);
-	if (!traceid_list) {
-		err = -ENOMEM;
-		goto err_free_hdr;
-	}
+	if (!traceid_list)
+		return -ENOMEM;
 
+	/* First the global part */
+	ptr = (u64 *) auxtrace_info->priv;
+	num_cpu = ptr[CS_PMU_TYPE_CPUS] & 0xffffffff;
 	metadata = zalloc(sizeof(*metadata) * num_cpu);
 	if (!metadata) {
 		err = -ENOMEM;
 		goto err_free_traceid_list;
 	}
 
+	/* Start parsing after the common part of the header */
+	i = CS_HEADER_VERSION_MAX;
+
 	/*
 	 * The metadata is stored in the auxtrace_info section and encodes
 	 * the configuration of the ARM embedded trace macrocell which is
@@ -3019,6 +2851,7 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
 	 * The following tests if the correct number of double words was
 	 * present in the auxtrace info section.
 	 */
+	priv_size = total_size - event_header_size - INFO_HEADER_SIZE;
 	if (i * 8 != priv_size) {
 		err = -EINVAL;
 		goto err_free_metadata;
@@ -3047,8 +2880,8 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
 	etm->machine = &session->machines.host;
 
 	etm->num_cpu = num_cpu;
-	etm->pmu_type = pmu_type;
-	etm->snapshot_mode = (hdr[CS_ETM_SNAPSHOT] != 0);
+	etm->pmu_type = (unsigned int) ((ptr[CS_PMU_TYPE_CPUS] >> 32) & 0xffffffff);
+	etm->snapshot_mode = (ptr[CS_ETM_SNAPSHOT] != 0);
 	etm->metadata = metadata;
 	etm->auxtrace_type = auxtrace_info->type;
 	etm->timeless_decoding = cs_etm__is_timeless_decoding(etm);
@@ -3082,10 +2915,6 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
 		goto err_delete_thread;
 	}
 
-	if (dump_trace) {
-		cs_etm__print_auxtrace_info(auxtrace_info->priv, num_cpu);
-	}
-
 	err = cs_etm__synth_events(etm, session);
 	if (err)
 		goto err_delete_thread;
@@ -3119,14 +2948,5 @@ err_free_metadata:
 	zfree(&metadata);
 err_free_traceid_list:
 	intlist__delete(traceid_list);
-err_free_hdr:
-	zfree(&hdr);
-	/*
-	 * At this point, as a minimum we have valid header. Dump the rest of
-	 * the info section - the print routines will error out on structural
-	 * issues.
-	 */
-	if (dump_trace)
-		cs_etm__print_auxtrace_info(auxtrace_info->priv, num_cpu);
 	return err;
 }
diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h
index 90c83f932d9a..5da50d5dae6b 100644
--- a/tools/perf/util/cs-etm.h
+++ b/tools/perf/util/cs-etm.h
@@ -7,6 +7,7 @@
 #ifndef INCLUDE__UTIL_PERF_CS_ETM_H__
 #define INCLUDE__UTIL_PERF_CS_ETM_H__
 
+#include "debug.h"
 #include "util/event.h"
 #include <linux/bits.h>
 
@@ -201,9 +202,13 @@ struct cs_etm_packet_queue {
 #define CS_ETMV4_PRIV_SIZE (CS_ETMV4_PRIV_MAX * sizeof(u64))
 #define CS_ETE_PRIV_SIZE (CS_ETE_PRIV_MAX * sizeof(u64))
 
-#ifdef HAVE_CSTRACE_SUPPORT
+#define INFO_HEADER_SIZE (sizeof(((struct perf_record_auxtrace_info *)0)->type) + \
+			  sizeof(((struct perf_record_auxtrace_info *)0)->reserved__))
+
 int cs_etm__process_auxtrace_info(union perf_event *event,
 				  struct perf_session *session);
+
+#ifdef HAVE_CSTRACE_SUPPORT
 int cs_etm__get_cpu(u8 trace_chan_id, int *cpu);
 int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt);
 int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
@@ -213,45 +218,16 @@ void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq,
 					      u8 trace_chan_id);
 struct cs_etm_packet_queue
 *cs_etm__etmq_get_packet_queue(struct cs_etm_queue *etmq, u8 trace_chan_id);
+int cs_etm__process_auxtrace_info_full(union perf_event *event __maybe_unused,
+				       struct perf_session *session __maybe_unused);
 #else
 static inline int
-cs_etm__process_auxtrace_info(union perf_event *event __maybe_unused,
-			      struct perf_session *session __maybe_unused)
-{
-	return -1;
-}
-
-static inline int cs_etm__get_cpu(u8 trace_chan_id __maybe_unused,
-				  int *cpu __maybe_unused)
+cs_etm__process_auxtrace_info_full(union perf_event *event __maybe_unused,
+				   struct perf_session *session __maybe_unused)
 {
+	pr_err("\nCS ETM Trace: OpenCSD is not linked in, please recompile with CORESIGHT=1\n");
 	return -1;
 }
-
-static inline int cs_etm__etmq_set_tid(
-				struct cs_etm_queue *etmq __maybe_unused,
-				pid_t tid __maybe_unused,
-				u8 trace_chan_id __maybe_unused)
-{
-	return -1;
-}
-
-static inline bool cs_etm__etmq_is_timeless(
-				struct cs_etm_queue *etmq __maybe_unused)
-{
-	/* What else to return? */
-	return true;
-}
-
-static inline void cs_etm__etmq_set_traceid_queue_timestamp(
-				struct cs_etm_queue *etmq __maybe_unused,
-				u8 trace_chan_id __maybe_unused) {}
-
-static inline struct cs_etm_packet_queue *cs_etm__etmq_get_packet_queue(
-				struct cs_etm_queue *etmq __maybe_unused,
-				u8 trace_chan_id __maybe_unused)
-{
-	return NULL;
-}
 #endif
 
 #endif
diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
index 9e0aee276df8..b842273458b8 100644
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -19,7 +19,6 @@
 #include <babeltrace/ctf-writer/event-fields.h>
 #include <babeltrace/ctf-ir/utils.h>
 #include <babeltrace/ctf/events.h>
-#include <traceevent/event-parse.h>
 #include "asm/bug.h"
 #include "data-convert.h"
 #include "session.h"
@@ -34,6 +33,11 @@
 #include <linux/time64.h>
 #include "util.h"
 #include "clockid.h"
+#include "util/sample.h"
+
+#ifdef HAVE_LIBTRACEEVENT
+#include <traceevent/event-parse.h>
+#endif
 
 #define pr_N(n, fmt, ...) \
 	eprintf(n, debug_data_convert, fmt, ##__VA_ARGS__)
@@ -318,8 +322,10 @@ static int add_tracepoint_field_value(struct ctf_writer *cw,
 		offset = tmp_val;
 		len = offset >> 16;
 		offset &= 0xffff;
+#ifdef HAVE_LIBTRACEEVENT_TEP_FIELD_IS_RELATIVE
 		if (flags & TEP_FIELD_IS_RELATIVE)
 			offset += fmtf->offset + fmtf->size;
+#endif
 	}
 
 	if (flags & TEP_FIELD_IS_ARRAY) {
diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c
index 613d6ae82663..ba9d93ce9463 100644
--- a/tools/perf/util/data-convert-json.c
+++ b/tools/perf/util/data-convert-json.c
@@ -27,6 +27,10 @@
 #include "util/thread.h"
 #include "util/tool.h"
 
+#ifdef HAVE_LIBTRACEEVENT
+#include <traceevent/event-parse.h>
+#endif
+
 struct convert_json {
 	struct perf_tool tool;
 	FILE *out;
@@ -217,6 +221,27 @@ static int process_sample_event(struct perf_tool *tool,
 	}
 	output_json_format(out, false, 3, "]");
 
+#ifdef HAVE_LIBTRACEEVENT
+	if (sample->raw_data) {
+		int i;
+		struct tep_format_field **fields;
+
+		fields = tep_event_fields(evsel->tp_format);
+		if (fields) {
+			i = 0;
+			while (fields[i]) {
+				struct trace_seq s;
+
+				trace_seq_init(&s);
+				tep_print_field(&s, sample->raw_data, fields[i]);
+				output_json_key_string(out, true, 3, fields[i]->name, s.buffer);
+
+				i++;
+			}
+			free(fields);
+		}
+	}
+#endif
 	output_json_format(out, false, 2, "}");
 	return 0;
 }
@@ -293,7 +318,9 @@ int bt_convert__perf2json(const char *input_name, const char *output_name,
 			.exit           = perf_event__process_exit,
 			.fork           = perf_event__process_fork,
 			.lost           = perf_event__process_lost,
+#ifdef HAVE_LIBTRACEEVENT
 			.tracing_data   = perf_event__process_tracing_data,
+#endif
 			.build_id       = perf_event__process_build_id,
 			.id_index       = perf_event__process_id_index,
 			.auxtrace_info  = perf_event__process_auxtrace_info,
diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c
index 609ca1671501..b07414409771 100644
--- a/tools/perf/util/dwarf-aux.c
+++ b/tools/perf/util/dwarf-aux.c
@@ -123,7 +123,7 @@ int cu_find_lineinfo(Dwarf_Die *cu_die, Dwarf_Addr addr,
 	if (die_find_realfunc(cu_die, addr, &die_mem)
 	    && die_entrypc(&die_mem, &faddr) == 0 &&
 	    faddr == addr) {
-		*fname = dwarf_decl_file(&die_mem);
+		*fname = die_get_decl_file(&die_mem);
 		dwarf_decl_line(&die_mem, lineno);
 		goto out;
 	}
@@ -137,7 +137,7 @@ int cu_find_lineinfo(Dwarf_Die *cu_die, Dwarf_Addr addr,
 	}
 
 out:
-	return *lineno ?: -ENOENT;
+	return (*lineno && *fname) ? *lineno : -ENOENT;
 }
 
 static int __die_find_inline_cb(Dwarf_Die *die_mem, void *data);
@@ -308,26 +308,13 @@ static int die_get_attr_udata(Dwarf_Die *tp_die, unsigned int attr_name,
 {
 	Dwarf_Attribute attr;
 
-	if (dwarf_attr(tp_die, attr_name, &attr) == NULL ||
+	if (dwarf_attr_integrate(tp_die, attr_name, &attr) == NULL ||
 	    dwarf_formudata(&attr, result) != 0)
 		return -ENOENT;
 
 	return 0;
 }
 
-/* Get attribute and translate it as a sdata */
-static int die_get_attr_sdata(Dwarf_Die *tp_die, unsigned int attr_name,
-			      Dwarf_Sword *result)
-{
-	Dwarf_Attribute attr;
-
-	if (dwarf_attr(tp_die, attr_name, &attr) == NULL ||
-	    dwarf_formsdata(&attr, result) != 0)
-		return -ENOENT;
-
-	return 0;
-}
-
 /**
  * die_is_signed_type - Check whether a type DIE is signed or not
  * @tp_die: a DIE of a type
@@ -467,9 +454,9 @@ int die_get_data_member_location(Dwarf_Die *mb_die, Dwarf_Word *offs)
 /* Get the call file index number in CU DIE */
 static int die_get_call_fileno(Dwarf_Die *in_die)
 {
-	Dwarf_Sword idx;
+	Dwarf_Word idx;
 
-	if (die_get_attr_sdata(in_die, DW_AT_call_file, &idx) == 0)
+	if (die_get_attr_udata(in_die, DW_AT_call_file, &idx) == 0)
 		return (int)idx;
 	else
 		return -ENOENT;
@@ -478,14 +465,27 @@ static int die_get_call_fileno(Dwarf_Die *in_die)
 /* Get the declared file index number in CU DIE */
 static int die_get_decl_fileno(Dwarf_Die *pdie)
 {
-	Dwarf_Sword idx;
+	Dwarf_Word idx;
 
-	if (die_get_attr_sdata(pdie, DW_AT_decl_file, &idx) == 0)
+	if (die_get_attr_udata(pdie, DW_AT_decl_file, &idx) == 0)
 		return (int)idx;
 	else
 		return -ENOENT;
 }
 
+/* Return the file name by index */
+static const char *die_get_file_name(Dwarf_Die *dw_die, int idx)
+{
+	Dwarf_Die cu_die;
+	Dwarf_Files *files;
+
+	if (idx < 0 || !dwarf_diecu(dw_die, &cu_die, NULL, NULL) ||
+	    dwarf_getsrcfiles(&cu_die, &files, NULL) != 0)
+		return NULL;
+
+	return dwarf_filesrc(files, idx, NULL, NULL);
+}
+
 /**
  * die_get_call_file - Get callsite file name of inlined function instance
  * @in_die: a DIE of an inlined function instance
@@ -495,18 +495,22 @@ static int die_get_decl_fileno(Dwarf_Die *pdie)
  */
 const char *die_get_call_file(Dwarf_Die *in_die)
 {
-	Dwarf_Die cu_die;
-	Dwarf_Files *files;
-	int idx;
-
-	idx = die_get_call_fileno(in_die);
-	if (idx < 0 || !dwarf_diecu(in_die, &cu_die, NULL, NULL) ||
-	    dwarf_getsrcfiles(&cu_die, &files, NULL) != 0)
-		return NULL;
-
-	return dwarf_filesrc(files, idx, NULL, NULL);
+	return die_get_file_name(in_die, die_get_call_fileno(in_die));
 }
 
+/**
+ * die_get_decl_file - Find the declared file name of this DIE
+ * @dw_die: a DIE for something declared.
+ *
+ * Get declared file name of @dw_die.
+ * NOTE: Since some version of clang DWARF5 implementation incorrectly uses
+ * file index 0 for DW_AT_decl_file, die_get_decl_file() will return NULL for
+ * such cases. Use this function instead.
+ */
+const char *die_get_decl_file(Dwarf_Die *dw_die)
+{
+	return die_get_file_name(dw_die, die_get_decl_fileno(dw_die));
+}
 
 /**
  * die_find_child - Generic DIE search function in DIE tree
@@ -790,7 +794,7 @@ static int __die_walk_funclines_cb(Dwarf_Die *in_die, void *data)
 	}
 
 	if (addr) {
-		fname = dwarf_decl_file(in_die);
+		fname = die_get_decl_file(in_die);
 		if (fname && dwarf_decl_line(in_die, &lineno) == 0) {
 			lw->retval = lw->callback(fname, lineno, addr, lw->data);
 			if (lw->retval != 0)
@@ -818,7 +822,7 @@ static int __die_walk_funclines(Dwarf_Die *sp_die, bool recursive,
 	int lineno;
 
 	/* Handle function declaration line */
-	fname = dwarf_decl_file(sp_die);
+	fname = die_get_decl_file(sp_die);
 	if (fname && dwarf_decl_line(sp_die, &lineno) == 0 &&
 	    die_entrypc(sp_die, &addr) == 0) {
 		lw.retval = callback(fname, lineno, addr, data);
@@ -873,7 +877,12 @@ int die_walk_lines(Dwarf_Die *rt_die, line_walk_callback_t callback, void *data)
 	if (dwarf_tag(rt_die) != DW_TAG_compile_unit) {
 		cu_die = dwarf_diecu(rt_die, &die_mem, NULL, NULL);
 		dwarf_decl_line(rt_die, &decl);
-		decf = dwarf_decl_file(rt_die);
+		decf = die_get_decl_file(rt_die);
+		if (!decf) {
+			pr_debug2("Failed to get the declared file name of %s\n",
+				  dwarf_diename(rt_die));
+			return -EINVAL;
+		}
 	} else
 		cu_die = rt_die;
 	if (!cu_die) {
@@ -923,7 +932,7 @@ int die_walk_lines(Dwarf_Die *rt_die, line_walk_callback_t callback, void *data)
 
 				dwarf_decl_line(&die_mem, &inl);
 				if (inl != decl ||
-				    decf != dwarf_decl_file(&die_mem))
+				    decf != die_get_decl_file(&die_mem))
 					continue;
 			}
 		}
diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h
index 7ee0fa19b5c4..7ec8bc1083bb 100644
--- a/tools/perf/util/dwarf-aux.h
+++ b/tools/perf/util/dwarf-aux.h
@@ -50,6 +50,9 @@ int die_get_call_lineno(Dwarf_Die *in_die);
 /* Get callsite file name of inlined function instance */
 const char *die_get_call_file(Dwarf_Die *in_die);
 
+/* Get declared file name of a DIE */
+const char *die_get_decl_file(Dwarf_Die *dw_die);
+
 /* Get type die */
 Dwarf_Die *die_get_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem);
 
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 12eae6917022..6663a676eadc 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -12,11 +12,10 @@
 #include <perf/event.h>
 #include <linux/types.h>
 
-#include "perf_regs.h"
-
 struct dso;
 struct machine;
 struct perf_event_attr;
+struct perf_sample;
 
 #ifdef __LP64__
 /*
@@ -44,61 +43,6 @@ struct perf_event_attr;
 /* perf sample has 16 bits size limit */
 #define PERF_SAMPLE_MAX_SIZE (1 << 16)
 
-/* number of register is bound by the number of bits in regs_dump::mask (64) */
-#define PERF_SAMPLE_REGS_CACHE_SIZE (8 * sizeof(u64))
-
-struct regs_dump {
-	u64 abi;
-	u64 mask;
-	u64 *regs;
-
-	/* Cached values/mask filled by first register access. */
-	u64 cache_regs[PERF_SAMPLE_REGS_CACHE_SIZE];
-	u64 cache_mask;
-};
-
-struct stack_dump {
-	u16 offset;
-	u64 size;
-	char *data;
-};
-
-struct sample_read_value {
-	u64 value;
-	u64 id;   /* only if PERF_FORMAT_ID */
-	u64 lost; /* only if PERF_FORMAT_LOST */
-};
-
-struct sample_read {
-	u64 time_enabled;
-	u64 time_running;
-	union {
-		struct {
-			u64 nr;
-			struct sample_read_value *values;
-		} group;
-		struct sample_read_value one;
-	};
-};
-
-static inline size_t sample_read_value_size(u64 read_format)
-{
-	/* PERF_FORMAT_ID is forced for PERF_SAMPLE_READ */
-	if (read_format & PERF_FORMAT_LOST)
-		return sizeof(struct sample_read_value);
-	else
-		return offsetof(struct sample_read_value, lost);
-}
-
-static inline struct sample_read_value *
-next_sample_read_value(struct sample_read_value *v, u64 read_format)
-{
-	return (void *)v + sample_read_value_size(read_format);
-}
-
-#define sample_read_group__for_each(v, nr, rf)		\
-	for (int __i = 0; __i < (int)nr; v = next_sample_read_value(v, rf), __i++)
-
 struct ip_callchain {
 	u64 nr;
 	u64 ips[];
@@ -140,52 +84,6 @@ enum {
 	PERF_IP_FLAG_VMENTRY		|\
 	PERF_IP_FLAG_VMEXIT)
 
-#define MAX_INSN 16
-
-struct aux_sample {
-	u64 size;
-	void *data;
-};
-
-struct perf_sample {
-	u64 ip;
-	u32 pid, tid;
-	u64 time;
-	u64 addr;
-	u64 id;
-	u64 stream_id;
-	u64 period;
-	u64 weight;
-	u64 transaction;
-	u64 insn_cnt;
-	u64 cyc_cnt;
-	u32 cpu;
-	u32 raw_size;
-	u64 data_src;
-	u64 phys_addr;
-	u64 data_page_size;
-	u64 code_page_size;
-	u64 cgroup;
-	u32 flags;
-	u32 machine_pid;
-	u32 vcpu;
-	u16 insn_len;
-	u8  cpumode;
-	u16 misc;
-	u16 ins_lat;
-	u16 p_stage_cyc;
-	bool no_hw_idx;		/* No hw_idx collected in branch_stack */
-	char insn[MAX_INSN];
-	void *raw_data;
-	struct ip_callchain *callchain;
-	struct branch_stack *branch_stack;
-	struct regs_dump  user_regs;
-	struct regs_dump  intr_regs;
-	struct stack_dump user_stack;
-	struct sample_read read;
-	struct aux_sample aux_sample;
-};
-
 #define PERF_MEM_DATA_SRC_NONE \
 	(PERF_MEM_S(OP, NA) |\
 	 PERF_MEM_S(LVL, NA) |\
@@ -344,15 +242,6 @@ struct perf_synth_intel_iflag_chg {
 	u64	branch_ip; /* If via_branch */
 };
 
-/*
- * raw_data is always 4 bytes from an 8-byte boundary, so subtract 4 to get
- * 8-byte alignment.
- */
-static inline void *perf_sample__synth_ptr(struct perf_sample *sample)
-{
-	return sample->raw_data - 4;
-}
-
 static inline void *perf_synth__raw_data(void *p)
 {
 	return p + 4;
@@ -446,19 +335,8 @@ int perf_event__process(struct perf_tool *tool,
 			struct perf_sample *sample,
 			struct machine *machine);
 
-struct addr_location;
-
-int machine__resolve(struct machine *machine, struct addr_location *al,
-		     struct perf_sample *sample);
-
-void addr_location__put(struct addr_location *al);
-
-struct thread;
-
 bool is_bts_event(struct perf_event_attr *attr);
 bool sample_addr_correlates_sym(struct perf_event_attr *attr);
-void thread__resolve(struct thread *thread, struct addr_location *al,
-		     struct perf_sample *sample);
 
 const char *perf_event__name(unsigned int id);
 
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 6612b00949e7..817df2504a1e 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -24,11 +24,13 @@
 #include "../perf.h"
 #include "asm/bug.h"
 #include "bpf-event.h"
+#include "util/event.h"
 #include "util/string2.h"
 #include "util/perf_api_probe.h"
 #include "util/evsel_fprintf.h"
 #include "util/evlist-hybrid.h"
 #include "util/pmu.h"
+#include "util/sample.h"
 #include <signal.h>
 #include <unistd.h>
 #include <sched.h>
@@ -228,7 +230,7 @@ out:
 	return err;
 }
 
-void evlist__set_leader(struct evlist *evlist)
+static void evlist__set_leader(struct evlist *evlist)
 {
 	perf_evlist__set_leader(&evlist->core);
 }
@@ -288,6 +290,7 @@ struct evsel *evlist__add_aux_dummy(struct evlist *evlist, bool system_wide)
 	return evsel;
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 struct evsel *evlist__add_sched_switch(struct evlist *evlist, bool system_wide)
 {
 	struct evsel *evsel = evsel__newtp_idx("sched", "sched_switch", 0);
@@ -303,7 +306,8 @@ struct evsel *evlist__add_sched_switch(struct evlist *evlist, bool system_wide)
 
 	evlist__add(evlist, evsel);
 	return evsel;
-};
+}
+#endif
 
 int evlist__add_attrs(struct evlist *evlist, struct perf_event_attr *attrs, size_t nr_attrs)
 {
@@ -374,6 +378,7 @@ struct evsel *evlist__find_tracepoint_by_name(struct evlist *evlist, const char
 	return NULL;
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 int evlist__add_newtp(struct evlist *evlist, const char *sys, const char *name, void *handler)
 {
 	struct evsel *evsel = evsel__newtp(sys, name);
@@ -385,6 +390,7 @@ int evlist__add_newtp(struct evlist *evlist, const char *sys, const char *name,
 	evlist__add(evlist, evsel);
 	return 0;
 }
+#endif
 
 struct evlist_cpu_iterator evlist__cpu_begin(struct evlist *evlist, struct affinity *affinity)
 {
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 16734c6756b3..01fa9d592c5a 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -127,7 +127,9 @@ static inline struct evsel *evlist__add_dummy_on_all_cpus(struct evlist *evlist)
 {
 	return evlist__add_aux_dummy(evlist, true);
 }
+#ifdef HAVE_LIBTRACEEVENT
 struct evsel *evlist__add_sched_switch(struct evlist *evlist, bool system_wide);
+#endif
 
 int evlist__add_sb_event(struct evlist *evlist, struct perf_event_attr *attr,
 			 evsel__sb_cb_t cb, void *data);
@@ -135,7 +137,9 @@ void evlist__set_cb(struct evlist *evlist, evsel__sb_cb_t cb, void *data);
 int evlist__start_sb_thread(struct evlist *evlist, struct target *target);
 void evlist__stop_sb_thread(struct evlist *evlist);
 
+#ifdef HAVE_LIBTRACEEVENT
 int evlist__add_newtp(struct evlist *evlist, const char *sys, const char *name, void *handler);
+#endif
 
 int __evlist__set_tracepoints_handlers(struct evlist *evlist,
 				       const struct evsel_str_handler *assocs,
@@ -217,8 +221,6 @@ void evlist__set_selected(struct evlist *evlist, struct evsel *evsel);
 int evlist__create_maps(struct evlist *evlist, struct target *target);
 int evlist__apply_filters(struct evlist *evlist, struct evsel **err_evsel);
 
-void evlist__set_leader(struct evlist *evlist);
-
 u64 __evlist__combined_sample_type(struct evlist *evlist);
 u64 evlist__combined_sample_type(struct evlist *evlist);
 u64 evlist__combined_branch_type(struct evlist *evlist);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 76605fde3507..999dd1700502 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -12,7 +12,6 @@
 #include <linux/bitops.h>
 #include <api/fs/fs.h>
 #include <api/fs/tracing_path.h>
-#include <traceevent/event-parse.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/perf_event.h>
 #include <linux/compiler.h>
@@ -46,20 +45,21 @@
 #include "string2.h"
 #include "memswap.h"
 #include "util.h"
-#ifdef HAVE_LIBBPF_SUPPORT
-#include <bpf/hashmap.h>
-#else
 #include "util/hashmap.h"
-#endif
 #include "pmu-hybrid.h"
 #include "off_cpu.h"
 #include "../perf-sys.h"
 #include "util/parse-branch-options.h"
 #include <internal/xyarray.h>
 #include <internal/lib.h>
+#include <internal/threadmap.h>
 
 #include <linux/ctype.h>
 
+#ifdef HAVE_LIBTRACEEVENT
+#include <traceevent/event-parse.h>
+#endif
+
 struct perf_missing_features perf_missing_features;
 
 static clockid_t clockid;
@@ -442,7 +442,9 @@ struct evsel *evsel__clone(struct evsel *orig)
 			goto out_err;
 	}
 	evsel->cgrp = cgroup__get(orig->cgrp);
+#ifdef HAVE_LIBTRACEEVENT
 	evsel->tp_format = orig->tp_format;
+#endif
 	evsel->handler = orig->handler;
 	evsel->core.leader = orig->core.leader;
 
@@ -467,6 +469,7 @@ struct evsel *evsel__clone(struct evsel *orig)
 	evsel->collect_stat = orig->collect_stat;
 	evsel->weak_group = orig->weak_group;
 	evsel->use_config_name = orig->use_config_name;
+	evsel->pmu = orig->pmu;
 
 	if (evsel__copy_config_terms(evsel, orig) < 0)
 		goto out_err;
@@ -481,6 +484,7 @@ out_err:
 /*
  * Returns pointer with encoded error via <linux/err.h> interface.
  */
+#ifdef HAVE_LIBTRACEEVENT
 struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx)
 {
 	struct evsel *evsel = zalloc(perf_evsel__object.size);
@@ -518,6 +522,7 @@ out_free:
 out_err:
 	return ERR_PTR(err);
 }
+#endif
 
 const char *const evsel__hw_names[PERF_COUNT_HW_MAX] = {
 	"cycles",
@@ -1525,13 +1530,8 @@ void evsel__compute_deltas(struct evsel *evsel, int cpu_map_idx, int thread,
 	if (!evsel->prev_raw_counts)
 		return;
 
-	if (cpu_map_idx == -1) {
-		tmp = evsel->prev_raw_counts->aggr;
-		evsel->prev_raw_counts->aggr = *count;
-	} else {
-		tmp = *perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread);
-		*perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread) = *count;
-	}
+	tmp = *perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread);
+	*perf_counts(evsel->prev_raw_counts, cpu_map_idx, thread) = *count;
 
 	count->val = count->val - tmp.val;
 	count->ena = count->ena - tmp.ena;
@@ -1966,17 +1966,16 @@ bool evsel__detect_missing_features(struct evsel *evsel)
 		perf_missing_features.mmap2 = true;
 		pr_debug2_peo("switching off mmap2\n");
 		return true;
-	} else if ((evsel->core.attr.exclude_guest || evsel->core.attr.exclude_host) &&
-		   (evsel->pmu == NULL || evsel->pmu->missing_features.exclude_guest)) {
-		if (evsel->pmu == NULL) {
+	} else if (evsel->core.attr.exclude_guest || evsel->core.attr.exclude_host) {
+		if (evsel->pmu == NULL)
 			evsel->pmu = evsel__find_pmu(evsel);
-			if (evsel->pmu)
-				evsel->pmu->missing_features.exclude_guest = true;
-			else {
-				/* we cannot find PMU, disable attrs now */
-				evsel->core.attr.exclude_host = false;
-				evsel->core.attr.exclude_guest = false;
-			}
+
+		if (evsel->pmu)
+			evsel->pmu->missing_features.exclude_guest = true;
+		else {
+			/* we cannot find PMU, disable attrs now */
+			evsel->core.attr.exclude_host = false;
+			evsel->core.attr.exclude_guest = false;
 		}
 
 		if (evsel->exclude_GH) {
@@ -2328,11 +2327,8 @@ u64 evsel__bitfield_swap_branch_flags(u64 value)
 	 * as it has variable bit-field sizes. Instead the
 	 * macro takes the bit-field position/size,
 	 * swaps it based on the host endianness.
-	 *
-	 * tep_is_bigendian() is used here instead of
-	 * bigendian() to avoid python test fails.
 	 */
-	if (tep_is_bigendian()) {
+	if (host_is_bigendian()) {
 		new_val = bitfield_swap(value, 0, 1);
 		new_val |= bitfield_swap(value, 1, 1);
 		new_val |= bitfield_swap(value, 2, 1);
@@ -2769,6 +2765,7 @@ u16 evsel__id_hdr_size(struct evsel *evsel)
 	return size;
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 struct tep_format_field *evsel__field(struct evsel *evsel, const char *name)
 {
 	return tep_find_field(evsel->tp_format, name);
@@ -2787,8 +2784,10 @@ void *evsel__rawptr(struct evsel *evsel, struct perf_sample *sample, const char
 	if (field->flags & TEP_FIELD_IS_DYNAMIC) {
 		offset = *(int *)(sample->raw_data + field->offset);
 		offset &= 0xffff;
+#ifdef HAVE_LIBTRACEEVENT_TEP_FIELD_IS_RELATIVE
 		if (field->flags & TEP_FIELD_IS_RELATIVE)
 			offset += field->offset + field->size;
+#endif
 	}
 
 	return sample->raw_data + offset;
@@ -2842,6 +2841,7 @@ u64 evsel__intval(struct evsel *evsel, struct perf_sample *sample, const char *n
 
 	return field ? format_field__intval(field, sample, evsel->needs_swap) : 0;
 }
+#endif
 
 bool evsel__fallback(struct evsel *evsel, int err, char *msg, size_t msgsize)
 {
@@ -3123,13 +3123,13 @@ void evsel__zero_per_pkg(struct evsel *evsel)
 
 	if (evsel->per_pkg_mask) {
 		hashmap__for_each_entry(evsel->per_pkg_mask, cur, bkt)
-			free((char *)cur->key);
+			free((void *)cur->pkey);
 
 		hashmap__clear(evsel->per_pkg_mask);
 	}
 }
 
-bool evsel__is_hybrid(struct evsel *evsel)
+bool evsel__is_hybrid(const struct evsel *evsel)
 {
 	return evsel->pmu_name && perf_pmu__is_hybrid(evsel->pmu_name);
 }
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 989865e16aad..d572be41b960 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -10,8 +10,6 @@
 #include <internal/evsel.h>
 #include <perf/evsel.h>
 #include "symbol_conf.h"
-#include <internal/cpumap.h>
-#include <perf/cpumap.h>
 
 struct bpf_object;
 struct cgroup;
@@ -74,7 +72,9 @@ struct evsel {
 		char			*name;
 		char			*group_name;
 		const char		*pmu_name;
+#ifdef HAVE_LIBTRACEEVENT
 		struct tep_event	*tp_format;
+#endif
 		char			*filter;
 		unsigned long		max_events;
 		double			scale;
@@ -225,11 +225,14 @@ static inline struct evsel *evsel__new(struct perf_event_attr *attr)
 }
 
 struct evsel *evsel__clone(struct evsel *orig);
-struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx);
 
 int copy_config_terms(struct list_head *dst, struct list_head *src);
 void free_config_terms(struct list_head *config_terms);
 
+
+#ifdef HAVE_LIBTRACEEVENT
+struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx);
+
 /*
  * Returns pointer with encoded error via <linux/err.h> interface.
  */
@@ -237,10 +240,13 @@ static inline struct evsel *evsel__newtp(const char *sys, const char *name)
 {
 	return evsel__newtp_idx(sys, name, 0);
 }
+#endif
 
 struct evsel *evsel__new_cycles(bool precise, __u32 type, __u64 config);
 
+#ifdef HAVE_LIBTRACEEVENT
 struct tep_event *event_format__new(const char *sys, const char *name);
+#endif
 
 void evsel__init(struct evsel *evsel, struct perf_event_attr *attr, int idx);
 void evsel__exit(struct evsel *evsel);
@@ -325,6 +331,7 @@ bool evsel__precise_ip_fallback(struct evsel *evsel);
 
 struct perf_sample;
 
+#ifdef HAVE_LIBTRACEEVENT
 void *evsel__rawptr(struct evsel *evsel, struct perf_sample *sample, const char *name);
 u64 evsel__intval(struct evsel *evsel, struct perf_sample *sample, const char *name);
 
@@ -332,6 +339,7 @@ static inline char *evsel__strval(struct evsel *evsel, struct perf_sample *sampl
 {
 	return evsel__rawptr(evsel, sample, name);
 }
+#endif
 
 struct tep_format_field;
 
@@ -498,7 +506,7 @@ struct perf_env *evsel__env(struct evsel *evsel);
 int evsel__store_ids(struct evsel *evsel, struct evlist *evlist);
 
 void evsel__zero_per_pkg(struct evsel *evsel);
-bool evsel__is_hybrid(struct evsel *evsel);
+bool evsel__is_hybrid(const struct evsel *evsel);
 struct evsel *evsel__leader(struct evsel *evsel);
 bool evsel__has_leader(struct evsel *evsel, struct evsel *leader);
 bool evsel__is_leader(struct evsel *evsel);
diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c
index 8c2ea8001329..bd22c4932d10 100644
--- a/tools/perf/util/evsel_fprintf.c
+++ b/tools/perf/util/evsel_fprintf.c
@@ -2,7 +2,6 @@
 #include <inttypes.h>
 #include <stdio.h>
 #include <stdbool.h>
-#include <traceevent/event-parse.h>
 #include "evsel.h"
 #include "util/evsel_fprintf.h"
 #include "util/event.h"
@@ -13,6 +12,10 @@
 #include "srcline.h"
 #include "dso.h"
 
+#ifdef HAVE_LIBTRACEEVENT
+#include <traceevent/event-parse.h>
+#endif
+
 static int comma_fprintf(FILE *fp, bool *first, const char *fmt, ...)
 {
 	va_list args;
@@ -74,6 +77,7 @@ int evsel__fprintf(struct evsel *evsel, struct perf_attr_details *details, FILE
 					 term, (u64)evsel->core.attr.sample_freq);
 	}
 
+#ifdef HAVE_LIBTRACEEVENT
 	if (details->trace_fields) {
 		struct tep_format_field *field;
 
@@ -96,6 +100,7 @@ int evsel__fprintf(struct evsel *evsel, struct perf_attr_details *details, FILE
 			field = field->next;
 		}
 	}
+#endif
 out:
 	fputc('\n', fp);
 	return ++printed;
diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c
index aaacf514dc09..00dcde35e0d3 100644
--- a/tools/perf/util/expr.c
+++ b/tools/perf/util/expr.c
@@ -11,6 +11,7 @@
 #include "expr.h"
 #include "expr-bison.h"
 #include "expr-flex.h"
+#include "util/hashmap.h"
 #include "smt.h"
 #include "tsc.h"
 #include <linux/err.h>
@@ -46,7 +47,7 @@ struct expr_id_data {
 	} kind;
 };
 
-static size_t key_hash(const void *key, void *ctx __maybe_unused)
+static size_t key_hash(long key, void *ctx __maybe_unused)
 {
 	const char *str = (const char *)key;
 	size_t hash = 0;
@@ -59,8 +60,7 @@ static size_t key_hash(const void *key, void *ctx __maybe_unused)
 	return hash;
 }
 
-static bool key_equal(const void *key1, const void *key2,
-		    void *ctx __maybe_unused)
+static bool key_equal(long key1, long key2, void *ctx __maybe_unused)
 {
 	return !strcmp((const char *)key1, (const char *)key2);
 }
@@ -84,8 +84,8 @@ void ids__free(struct hashmap *ids)
 		return;
 
 	hashmap__for_each_entry(ids, cur, bkt) {
-		free((char *)cur->key);
-		free(cur->value);
+		free((void *)cur->pkey);
+		free((void *)cur->pvalue);
 	}
 
 	hashmap__free(ids);
@@ -97,8 +97,7 @@ int ids__insert(struct hashmap *ids, const char *id)
 	char *old_key = NULL;
 	int ret;
 
-	ret = hashmap__set(ids, id, data_ptr,
-			   (const void **)&old_key, (void **)&old_data);
+	ret = hashmap__set(ids, id, data_ptr, &old_key, &old_data);
 	if (ret)
 		free(data_ptr);
 	free(old_key);
@@ -127,8 +126,7 @@ struct hashmap *ids__union(struct hashmap *ids1, struct hashmap *ids2)
 		ids2 = tmp;
 	}
 	hashmap__for_each_entry(ids2, cur, bkt) {
-		ret = hashmap__set(ids1, cur->key, cur->value,
-				(const void **)&old_key, (void **)&old_data);
+		ret = hashmap__set(ids1, cur->key, cur->value, &old_key, &old_data);
 		free(old_key);
 		free(old_data);
 
@@ -169,8 +167,7 @@ int expr__add_id_val_source_count(struct expr_parse_ctx *ctx, const char *id,
 	data_ptr->val.source_count = source_count;
 	data_ptr->kind = EXPR_ID_DATA__VALUE;
 
-	ret = hashmap__set(ctx->ids, id, data_ptr,
-			   (const void **)&old_key, (void **)&old_data);
+	ret = hashmap__set(ctx->ids, id, data_ptr, &old_key, &old_data);
 	if (ret)
 		free(data_ptr);
 	free(old_key);
@@ -205,8 +202,7 @@ int expr__add_ref(struct expr_parse_ctx *ctx, struct metric_ref *ref)
 	data_ptr->ref.metric_expr = ref->metric_expr;
 	data_ptr->kind = EXPR_ID_DATA__REF;
 
-	ret = hashmap__set(ctx->ids, name, data_ptr,
-			   (const void **)&old_key, (void **)&old_data);
+	ret = hashmap__set(ctx->ids, name, data_ptr, &old_key, &old_data);
 	if (ret)
 		free(data_ptr);
 
@@ -221,7 +217,7 @@ int expr__add_ref(struct expr_parse_ctx *ctx, struct metric_ref *ref)
 int expr__get_id(struct expr_parse_ctx *ctx, const char *id,
 		 struct expr_id_data **data)
 {
-	return hashmap__find(ctx->ids, id, (void **)data) ? 0 : -1;
+	return hashmap__find(ctx->ids, id, data) ? 0 : -1;
 }
 
 bool expr__subset_of_ids(struct expr_parse_ctx *haystack,
@@ -232,7 +228,7 @@ bool expr__subset_of_ids(struct expr_parse_ctx *haystack,
 	struct expr_id_data *data;
 
 	hashmap__for_each_entry(needles->ids, cur, bkt) {
-		if (expr__get_id(haystack, cur->key, &data))
+		if (expr__get_id(haystack, cur->pkey, &data))
 			return false;
 	}
 	return true;
@@ -282,8 +278,7 @@ void expr__del_id(struct expr_parse_ctx *ctx, const char *id)
 	struct expr_id_data *old_val = NULL;
 	char *old_key = NULL;
 
-	hashmap__delete(ctx->ids, id,
-			(const void **)&old_key, (void **)&old_val);
+	hashmap__delete(ctx->ids, id, &old_key, &old_val);
 	free(old_key);
 	free(old_val);
 }
@@ -314,8 +309,8 @@ void expr__ctx_clear(struct expr_parse_ctx *ctx)
 	size_t bkt;
 
 	hashmap__for_each_entry(ctx->ids, cur, bkt) {
-		free((char *)cur->key);
-		free(cur->value);
+		free((void *)cur->pkey);
+		free(cur->pvalue);
 	}
 	hashmap__clear(ctx->ids);
 }
@@ -330,8 +325,8 @@ void expr__ctx_free(struct expr_parse_ctx *ctx)
 
 	free(ctx->sctx.user_requested_cpu_list);
 	hashmap__for_each_entry(ctx->ids, cur, bkt) {
-		free((char *)cur->key);
-		free(cur->value);
+		free((void *)cur->pkey);
+		free(cur->pvalue);
 	}
 	hashmap__free(ctx->ids);
 	free(ctx);
diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h
index d6c1668dc1a0..029271540fb0 100644
--- a/tools/perf/util/expr.h
+++ b/tools/perf/util/expr.h
@@ -2,12 +2,7 @@
 #ifndef PARSE_CTX_H
 #define PARSE_CTX_H 1
 
-#ifdef HAVE_LIBBPF_SUPPORT
-#include <bpf/hashmap.h>
-#else
-#include "util/hashmap.h"
-#endif
-
+struct hashmap;
 struct metric_ref;
 
 struct expr_scanner_ctx {
diff --git a/tools/perf/util/hashmap.c b/tools/perf/util/hashmap.c
index aeb09c288716..140ee4055676 100644
--- a/tools/perf/util/hashmap.c
+++ b/tools/perf/util/hashmap.c
@@ -128,7 +128,7 @@ static int hashmap_grow(struct hashmap *map)
 }
 
 static bool hashmap_find_entry(const struct hashmap *map,
-			       const void *key, size_t hash,
+			       const long key, size_t hash,
 			       struct hashmap_entry ***pprev,
 			       struct hashmap_entry **entry)
 {
@@ -151,18 +151,18 @@ static bool hashmap_find_entry(const struct hashmap *map,
 	return false;
 }
 
-int hashmap__insert(struct hashmap *map, const void *key, void *value,
-		    enum hashmap_insert_strategy strategy,
-		    const void **old_key, void **old_value)
+int hashmap_insert(struct hashmap *map, long key, long value,
+		   enum hashmap_insert_strategy strategy,
+		   long *old_key, long *old_value)
 {
 	struct hashmap_entry *entry;
 	size_t h;
 	int err;
 
 	if (old_key)
-		*old_key = NULL;
+		*old_key = 0;
 	if (old_value)
-		*old_value = NULL;
+		*old_value = 0;
 
 	h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits);
 	if (strategy != HASHMAP_APPEND &&
@@ -203,7 +203,7 @@ int hashmap__insert(struct hashmap *map, const void *key, void *value,
 	return 0;
 }
 
-bool hashmap__find(const struct hashmap *map, const void *key, void **value)
+bool hashmap_find(const struct hashmap *map, long key, long *value)
 {
 	struct hashmap_entry *entry;
 	size_t h;
@@ -217,8 +217,8 @@ bool hashmap__find(const struct hashmap *map, const void *key, void **value)
 	return true;
 }
 
-bool hashmap__delete(struct hashmap *map, const void *key,
-		     const void **old_key, void **old_value)
+bool hashmap_delete(struct hashmap *map, long key,
+		    long *old_key, long *old_value)
 {
 	struct hashmap_entry **pprev, *entry;
 	size_t h;
diff --git a/tools/perf/util/hashmap.h b/tools/perf/util/hashmap.h
index 10a4c4cd13cf..0a5bf1937a7c 100644
--- a/tools/perf/util/hashmap.h
+++ b/tools/perf/util/hashmap.h
@@ -40,12 +40,32 @@ static inline size_t str_hash(const char *s)
 	return h;
 }
 
-typedef size_t (*hashmap_hash_fn)(const void *key, void *ctx);
-typedef bool (*hashmap_equal_fn)(const void *key1, const void *key2, void *ctx);
+typedef size_t (*hashmap_hash_fn)(long key, void *ctx);
+typedef bool (*hashmap_equal_fn)(long key1, long key2, void *ctx);
 
+/*
+ * Hashmap interface is polymorphic, keys and values could be either
+ * long-sized integers or pointers, this is achieved as follows:
+ * - interface functions that operate on keys and values are hidden
+ *   behind auxiliary macros, e.g. hashmap_insert <-> hashmap__insert;
+ * - these auxiliary macros cast the key and value parameters as
+ *   long or long *, so the user does not have to specify the casts explicitly;
+ * - for pointer parameters (e.g. old_key) the size of the pointed
+ *   type is verified by hashmap_cast_ptr using _Static_assert;
+ * - when iterating using hashmap__for_each_* forms
+ *   hasmap_entry->key should be used for integer keys and
+ *   hasmap_entry->pkey should be used for pointer keys,
+ *   same goes for values.
+ */
 struct hashmap_entry {
-	const void *key;
-	void *value;
+	union {
+		long key;
+		const void *pkey;
+	};
+	union {
+		long value;
+		void *pvalue;
+	};
 	struct hashmap_entry *next;
 };
 
@@ -102,6 +122,13 @@ enum hashmap_insert_strategy {
 	HASHMAP_APPEND,
 };
 
+#define hashmap_cast_ptr(p) ({								\
+	_Static_assert((__builtin_constant_p((p)) ? (p) == NULL : 0) ||			\
+				sizeof(*(p)) == sizeof(long),				\
+		       #p " pointee should be a long-sized integer or a pointer");	\
+	(long *)(p);									\
+})
+
 /*
  * hashmap__insert() adds key/value entry w/ various semantics, depending on
  * provided strategy value. If a given key/value pair replaced already
@@ -109,42 +136,38 @@ enum hashmap_insert_strategy {
  * through old_key and old_value to allow calling code do proper memory
  * management.
  */
-int hashmap__insert(struct hashmap *map, const void *key, void *value,
-		    enum hashmap_insert_strategy strategy,
-		    const void **old_key, void **old_value);
+int hashmap_insert(struct hashmap *map, long key, long value,
+		   enum hashmap_insert_strategy strategy,
+		   long *old_key, long *old_value);
 
-static inline int hashmap__add(struct hashmap *map,
-			       const void *key, void *value)
-{
-	return hashmap__insert(map, key, value, HASHMAP_ADD, NULL, NULL);
-}
+#define hashmap__insert(map, key, value, strategy, old_key, old_value) \
+	hashmap_insert((map), (long)(key), (long)(value), (strategy),  \
+		       hashmap_cast_ptr(old_key),		       \
+		       hashmap_cast_ptr(old_value))
 
-static inline int hashmap__set(struct hashmap *map,
-			       const void *key, void *value,
-			       const void **old_key, void **old_value)
-{
-	return hashmap__insert(map, key, value, HASHMAP_SET,
-			       old_key, old_value);
-}
+#define hashmap__add(map, key, value) \
+	hashmap__insert((map), (key), (value), HASHMAP_ADD, NULL, NULL)
 
-static inline int hashmap__update(struct hashmap *map,
-				  const void *key, void *value,
-				  const void **old_key, void **old_value)
-{
-	return hashmap__insert(map, key, value, HASHMAP_UPDATE,
-			       old_key, old_value);
-}
+#define hashmap__set(map, key, value, old_key, old_value) \
+	hashmap__insert((map), (key), (value), HASHMAP_SET, (old_key), (old_value))
 
-static inline int hashmap__append(struct hashmap *map,
-				  const void *key, void *value)
-{
-	return hashmap__insert(map, key, value, HASHMAP_APPEND, NULL, NULL);
-}
+#define hashmap__update(map, key, value, old_key, old_value) \
+	hashmap__insert((map), (key), (value), HASHMAP_UPDATE, (old_key), (old_value))
+
+#define hashmap__append(map, key, value) \
+	hashmap__insert((map), (key), (value), HASHMAP_APPEND, NULL, NULL)
+
+bool hashmap_delete(struct hashmap *map, long key, long *old_key, long *old_value);
+
+#define hashmap__delete(map, key, old_key, old_value)		       \
+	hashmap_delete((map), (long)(key),			       \
+		       hashmap_cast_ptr(old_key),		       \
+		       hashmap_cast_ptr(old_value))
 
-bool hashmap__delete(struct hashmap *map, const void *key,
-		     const void **old_key, void **old_value);
+bool hashmap_find(const struct hashmap *map, long key, long *value);
 
-bool hashmap__find(const struct hashmap *map, const void *key, void **value);
+#define hashmap__find(map, key, value) \
+	hashmap_find((map), (long)(key), hashmap_cast_ptr(value))
 
 /*
  * hashmap__for_each_entry - iterate over all entries in hashmap
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 98dfaf84bd13..404d816ca124 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -6,6 +6,7 @@
 #include <sys/types.h>
 #include <byteswap.h>
 #include <unistd.h>
+#include <regex.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <linux/compiler.h>
@@ -55,6 +56,10 @@
 #include <linux/ctype.h>
 #include <internal/lib.h>
 
+#ifdef HAVE_LIBTRACEEVENT
+#include <traceevent/event-parse.h>
+#endif
+
 /*
  * magic2 = "PERFILE2"
  * must be a numerical value to let the endianness
@@ -79,12 +84,12 @@ struct perf_file_attr {
 
 void perf_header__set_feat(struct perf_header *header, int feat)
 {
-	set_bit(feat, header->adds_features);
+	__set_bit(feat, header->adds_features);
 }
 
 void perf_header__clear_feat(struct perf_header *header, int feat)
 {
-	clear_bit(feat, header->adds_features);
+	__clear_bit(feat, header->adds_features);
 }
 
 bool perf_header__has_feat(const struct perf_header *header, int feat)
@@ -298,6 +303,7 @@ static int do_read_bitmap(struct feat_fd *ff, unsigned long **pset, u64 *psize)
 	return 0;
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 static int write_tracing_data(struct feat_fd *ff,
 			      struct evlist *evlist)
 {
@@ -306,6 +312,7 @@ static int write_tracing_data(struct feat_fd *ff,
 
 	return read_tracing_data(ff->fd, &evlist->core.entries);
 }
+#endif
 
 static int write_build_id(struct feat_fd *ff,
 			  struct evlist *evlist __maybe_unused)
@@ -1358,7 +1365,7 @@ static int memory_node__read(struct memory_node *n, unsigned long idx)
 	rewinddir(dir);
 
 	for_each_memory(phys, dir) {
-		set_bit(phys, n->set);
+		__set_bit(phys, n->set);
 	}
 
 	closedir(dir);
@@ -2394,12 +2401,14 @@ FEAT_PROCESS_STR_FUN(arch, arch);
 FEAT_PROCESS_STR_FUN(cpudesc, cpu_desc);
 FEAT_PROCESS_STR_FUN(cpuid, cpuid);
 
+#ifdef HAVE_LIBTRACEEVENT
 static int process_tracing_data(struct feat_fd *ff, void *data)
 {
 	ssize_t ret = trace_report(ff->fd, data, false);
 
 	return ret < 0 ? -1 : 0;
 }
+#endif
 
 static int process_build_id(struct feat_fd *ff, void *data __maybe_unused)
 {
@@ -3366,7 +3375,9 @@ err:
 const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE];
 
 const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = {
+#ifdef HAVE_LIBTRACEEVENT
 	FEAT_OPN(TRACING_DATA,	tracing_data,	false),
+#endif
 	FEAT_OPN(BUILD_ID,	build_id,	false),
 	FEAT_OPR(HOSTNAME,	hostname,	false),
 	FEAT_OPR(OSRELEASE,	osrelease,	false),
@@ -3952,7 +3963,7 @@ int perf_file_header__read(struct perf_file_header *header,
 
 		if (!test_bit(HEADER_HOSTNAME, header->adds_features)) {
 			bitmap_zero(header->adds_features, HEADER_FEAT_BITS);
-			set_bit(HEADER_BUILD_ID, header->adds_features);
+			__set_bit(HEADER_BUILD_ID, header->adds_features);
 		}
 	}
 
@@ -4082,6 +4093,7 @@ static int read_attr(int fd, struct perf_header *ph,
 	return ret <= 0 ? -1 : 0;
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 static int evsel__prepare_tracepoint_event(struct evsel *evsel, struct tep_handle *pevent)
 {
 	struct tep_event *event;
@@ -4125,6 +4137,7 @@ static int evlist__prepare_tracepoint_events(struct evlist *evlist, struct tep_h
 
 	return 0;
 }
+#endif
 
 int perf_session__read_header(struct perf_session *session, int repipe_fd)
 {
@@ -4230,11 +4243,15 @@ int perf_session__read_header(struct perf_session *session, int repipe_fd)
 		lseek(fd, tmp, SEEK_SET);
 	}
 
+#ifdef HAVE_LIBTRACEEVENT
 	perf_header__process_sections(header, fd, &session->tevent,
 				      perf_file_section__process);
 
 	if (evlist__prepare_tracepoint_events(session->evlist, session->tevent.pevent))
 		goto out_delete_evlist;
+#else
+	perf_header__process_sections(header, fd, NULL, perf_file_section__process);
+#endif
 
 	return 0;
 out_errno:
@@ -4412,6 +4429,7 @@ int perf_event__process_event_update(struct perf_tool *tool __maybe_unused,
 	return 0;
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 int perf_event__process_tracing_data(struct perf_session *session,
 				     union perf_event *event)
 {
@@ -4459,6 +4477,7 @@ int perf_event__process_tracing_data(struct perf_session *session,
 
 	return size_read + padding;
 }
+#endif
 
 int perf_event__process_build_id(struct perf_session *session,
 				 union perf_event *event)
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 2d5e601ba60f..e3861ae62172 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -160,8 +160,10 @@ int perf_event__process_event_update(struct perf_tool *tool,
 				     union perf_event *event,
 				     struct evlist **pevlist);
 size_t perf_event__fprintf_event_update(union perf_event *event, FILE *fp);
+#ifdef HAVE_LIBTRACEEVENT
 int perf_event__process_tracing_data(struct perf_session *session,
 				     union perf_event *event);
+#endif
 int perf_event__process_build_id(struct perf_session *session,
 				 union perf_event *event);
 bool is_perf_magic(u64 magic);
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c
index 1376077183f7..22308dd93010 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c
@@ -18,6 +18,7 @@
 
 #include "intel-pt-insn-decoder.h"
 #include "dump-insn.h"
+#include "util/sample.h"
 
 #if INTEL_PT_INSN_BUF_SZ < MAX_INSN_SIZE || INTEL_PT_INSN_BUF_SZ > MAX_INSN
 #error Instruction buffer size too small
diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index e3548ddef254..6d3921627e33 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -3142,6 +3142,7 @@ static int intel_pt_sync_switch(struct intel_pt *pt, int cpu, pid_t tid,
 	return 1;
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 static int intel_pt_process_switch(struct intel_pt *pt,
 				   struct perf_sample *sample)
 {
@@ -3165,6 +3166,7 @@ static int intel_pt_process_switch(struct intel_pt *pt,
 
 	return machine__set_current_tid(pt->machine, cpu, -1, tid);
 }
+#endif /* HAVE_LIBTRACEEVENT */
 
 static int intel_pt_context_switch_in(struct intel_pt *pt,
 				      struct perf_sample *sample)
@@ -3433,9 +3435,12 @@ static int intel_pt_process_event(struct perf_session *session,
 			return err;
 	}
 
+#ifdef HAVE_LIBTRACEEVENT
 	if (pt->switch_evsel && event->header.type == PERF_RECORD_SAMPLE)
 		err = intel_pt_process_switch(pt, sample);
-	else if (event->header.type == PERF_RECORD_ITRACE_START)
+	else
+#endif
+	if (event->header.type == PERF_RECORD_ITRACE_START)
 		err = intel_pt_process_itrace_start(pt, event, sample);
 	else if (event->header.type == PERF_RECORD_AUX_OUTPUT_HW_ID)
 		err = intel_pt_process_aux_output_hw_id(pt, event, sample);
diff --git a/tools/perf/util/iostat.c b/tools/perf/util/iostat.c
index 57dd49da28fe..b770bd473af7 100644
--- a/tools/perf/util/iostat.c
+++ b/tools/perf/util/iostat.c
@@ -48,6 +48,7 @@ __weak void iostat_print_counters(struct evlist *evlist __maybe_unused,
 				  struct perf_stat_config *config __maybe_unused,
 				  struct timespec *ts __maybe_unused,
 				  char *prefix __maybe_unused,
-				  iostat_print_counter_t print_cnt_cb __maybe_unused)
+				  iostat_print_counter_t print_cnt_cb __maybe_unused,
+				  void *arg __maybe_unused)
 {
 }
diff --git a/tools/perf/util/iostat.h b/tools/perf/util/iostat.h
index 23c1c46a331a..a4e7299c5c2f 100644
--- a/tools/perf/util/iostat.h
+++ b/tools/perf/util/iostat.h
@@ -28,7 +28,7 @@ enum iostat_mode_t {
 
 extern enum iostat_mode_t iostat_mode;
 
-typedef void (*iostat_print_counter_t)(struct perf_stat_config *, struct evsel *, char *);
+typedef void (*iostat_print_counter_t)(struct perf_stat_config *, struct evsel *, void *);
 
 int iostat_prepare(struct evlist *evlist, struct perf_stat_config *config);
 int iostat_parse(const struct option *opt, const char *str,
@@ -42,6 +42,6 @@ void iostat_print_metric(struct perf_stat_config *config, struct evsel *evsel,
 			 struct perf_stat_output_ctx *out);
 void iostat_print_counters(struct evlist *evlist,
 			   struct perf_stat_config *config, struct timespec *ts,
-			   char *prefix, iostat_print_counter_t print_cnt_cb);
+			   char *prefix, iostat_print_counter_t print_cnt_cb, void *arg);
 
 #endif /* _IOSTAT_H */
diff --git a/tools/perf/util/kwork.h b/tools/perf/util/kwork.h
index 320c0a6d2e08..53b7327550b8 100644
--- a/tools/perf/util/kwork.h
+++ b/tools/perf/util/kwork.h
@@ -1,16 +1,16 @@
 #ifndef PERF_UTIL_KWORK_H
 #define PERF_UTIL_KWORK_H
 
-#include "perf.h"
-
 #include "util/tool.h"
-#include "util/event.h"
-#include "util/evlist.h"
-#include "util/session.h"
 #include "util/time-utils.h"
 
-#include <linux/list.h>
 #include <linux/bitmap.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/types.h>
+
+struct perf_sample;
+struct perf_session;
 
 enum kwork_class_type {
 	KWORK_CLASS_IRQ,
diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c
index 2dc797007419..650ffe336f3a 100644
--- a/tools/perf/util/llvm-utils.c
+++ b/tools/perf/util/llvm-utils.c
@@ -463,7 +463,7 @@ int llvm__compile_bpf(const char *path, void **p_obj_buf,
 	char *pipe_template = NULL;
 	const char *opts = llvm_param.opts;
 	char *command_echo = NULL, *command_out;
-	char *perf_include_dir = system_path(PERF_INCLUDE_DIR);
+	char *libbpf_include_dir = system_path(LIBBPF_INCLUDE_DIR);
 
 	if (path[0] != '-' && realpath(path, abspath) == NULL) {
 		err = errno;
@@ -495,7 +495,7 @@ int llvm__compile_bpf(const char *path, void **p_obj_buf,
 
 	snprintf(linux_version_code_str, sizeof(linux_version_code_str),
 		 "0x%x", kernel_version);
-	if (asprintf(&perf_bpf_include_opts, "-I%s/bpf", perf_include_dir) < 0)
+	if (asprintf(&perf_bpf_include_opts, "-I%s/", libbpf_include_dir) < 0)
 		goto errout;
 	force_set_env("NR_CPUS", nr_cpus_avail_str);
 	force_set_env("LINUX_VERSION_CODE", linux_version_code_str);
@@ -556,7 +556,7 @@ int llvm__compile_bpf(const char *path, void **p_obj_buf,
 	free(kbuild_dir);
 	free(kbuild_include_opts);
 	free(perf_bpf_include_opts);
-	free(perf_include_dir);
+	free(libbpf_include_dir);
 
 	if (!p_obj_buf)
 		free(obj_buf);
@@ -572,7 +572,7 @@ errout:
 	free(kbuild_include_opts);
 	free(obj_buf);
 	free(perf_bpf_include_opts);
-	free(perf_include_dir);
+	free(libbpf_include_dir);
 	free(pipe_template);
 	if (p_obj_buf)
 		*p_obj_buf = NULL;
diff --git a/tools/perf/util/lock-contention.h b/tools/perf/util/lock-contention.h
index b8cb8830b7bc..47fd47fb56c1 100644
--- a/tools/perf/util/lock-contention.h
+++ b/tools/perf/util/lock-contention.h
@@ -91,7 +91,7 @@ struct thread_stat {
  * Number of stack trace entries to skip when finding callers.
  * The first few entries belong to the locking implementation itself.
  */
-#define CONTENTION_STACK_SKIP  3
+#define CONTENTION_STACK_SKIP  4
 
 /*
  * flags for lock:contention_begin
@@ -117,6 +117,7 @@ struct lock_contention {
 	int lost;
 	int max_stack;
 	int stack_skip;
+	int aggr_mode;
 };
 
 #ifdef HAVE_BPF_SKEL
@@ -145,6 +146,4 @@ static inline int lock_contention_read(struct lock_contention *con __maybe_unuse
 
 #endif  /* HAVE_BPF_SKEL */
 
-bool is_lock_function(struct machine *machine, u64 addr);
-
 #endif  /* PERF_LOCK_CONTENTION_H */
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 76316e459c3d..803c9d1803dd 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -3336,3 +3336,43 @@ int machine__for_each_kernel_map(struct machine *machine, machine__map_t fn, voi
 	}
 	return err;
 }
+
+bool machine__is_lock_function(struct machine *machine, u64 addr)
+{
+	if (!machine->sched.text_start) {
+		struct map *kmap;
+		struct symbol *sym = machine__find_kernel_symbol_by_name(machine, "__sched_text_start", &kmap);
+
+		if (!sym) {
+			/* to avoid retry */
+			machine->sched.text_start = 1;
+			return false;
+		}
+
+		machine->sched.text_start = kmap->unmap_ip(kmap, sym->start);
+
+		/* should not fail from here */
+		sym = machine__find_kernel_symbol_by_name(machine, "__sched_text_end", &kmap);
+		machine->sched.text_end = kmap->unmap_ip(kmap, sym->start);
+
+		sym = machine__find_kernel_symbol_by_name(machine, "__lock_text_start", &kmap);
+		machine->lock.text_start = kmap->unmap_ip(kmap, sym->start);
+
+		sym = machine__find_kernel_symbol_by_name(machine, "__lock_text_end", &kmap);
+		machine->lock.text_end = kmap->unmap_ip(kmap, sym->start);
+	}
+
+	/* failed to get kernel symbols */
+	if (machine->sched.text_start == 1)
+		return false;
+
+	/* mutex and rwsem functions are in sched text section */
+	if (machine->sched.text_start <= addr && addr < machine->sched.text_end)
+		return true;
+
+	/* spinlock functions are in lock text section */
+	if (machine->lock.text_start <= addr && addr < machine->lock.text_end)
+		return true;
+
+	return false;
+}
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index 74935dfaa937..d034ecaf89c1 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -56,6 +56,10 @@ struct machine {
 	struct maps	  *kmaps;
 	struct map	  *vmlinux_map;
 	u64		  kernel_start;
+	struct {
+		u64	  text_start;
+		u64	  text_end;
+	} sched, lock;
 	pid_t		  *current_tid;
 	size_t		  current_tid_sz;
 	union { /* Tool specific area */
@@ -212,6 +216,7 @@ static inline bool machine__is_host(struct machine *machine)
 	return machine ? machine->pid == HOST_KERNEL_ID : false;
 }
 
+bool machine__is_lock_function(struct machine *machine, u64 addr);
 bool machine__is(struct machine *machine, const char *arch);
 bool machine__normalized_is(struct machine *machine, const char *arch);
 int machine__nr_cpus_avail(struct machine *machine);
@@ -305,4 +310,7 @@ int machine__create_extra_kernel_map(struct machine *machine,
 int machine__map_x86_64_entry_trampolines(struct machine *machine,
 					  struct dso *kernel);
 
+int machine__resolve(struct machine *machine, struct addr_location *al,
+		     struct perf_sample *sample);
+
 #endif /* __PERF_MACHINE_H */
diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index 4c98ac29ee13..b9c273ed080a 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -12,6 +12,7 @@
 #include "strbuf.h"
 #include "pmu.h"
 #include "pmu-hybrid.h"
+#include "print-events.h"
 #include "expr.h"
 #include "rblist.h"
 #include <string.h>
@@ -28,6 +29,7 @@
 #include "util.h"
 #include <asm/bug.h>
 #include "cgroup.h"
+#include "util/hashmap.h"
 
 struct metric_event *metricgroup__lookup(struct rblist *metric_events,
 					 struct evsel *evsel,
@@ -288,7 +290,7 @@ static int setup_metric_events(struct hashmap *ids,
 		 * combined or shared groups, this metric may not care
 		 * about this event.
 		 */
-		if (hashmap__find(ids, metric_id, (void **)&val_ptr)) {
+		if (hashmap__find(ids, metric_id, &val_ptr)) {
 			metric_events[matched_events++] = ev;
 
 			if (matched_events >= ids_size)
@@ -352,51 +354,65 @@ static bool match_pe_metric(const struct pmu_event *pe, const char *metric)
 	       match_metric(pe->metric_name, metric);
 }
 
+/** struct mep - RB-tree node for building printing information. */
 struct mep {
+	/** nd - RB-tree element. */
 	struct rb_node nd;
-	const char *name;
-	struct strlist *metrics;
+	/** @metric_group: Owned metric group name, separated others with ';'. */
+	char *metric_group;
+	const char *metric_name;
+	const char *metric_desc;
+	const char *metric_long_desc;
+	const char *metric_expr;
+	const char *metric_unit;
 };
 
 static int mep_cmp(struct rb_node *rb_node, const void *entry)
 {
 	struct mep *a = container_of(rb_node, struct mep, nd);
 	struct mep *b = (struct mep *)entry;
+	int ret;
 
-	return strcmp(a->name, b->name);
+	ret = strcmp(a->metric_group, b->metric_group);
+	if (ret)
+		return ret;
+
+	return strcmp(a->metric_name, b->metric_name);
 }
 
-static struct rb_node *mep_new(struct rblist *rl __maybe_unused,
-					const void *entry)
+static struct rb_node *mep_new(struct rblist *rl __maybe_unused, const void *entry)
 {
 	struct mep *me = malloc(sizeof(struct mep));
 
 	if (!me)
 		return NULL;
+
 	memcpy(me, entry, sizeof(struct mep));
-	me->name = strdup(me->name);
-	if (!me->name)
-		goto out_me;
-	me->metrics = strlist__new(NULL, NULL);
-	if (!me->metrics)
-		goto out_name;
 	return &me->nd;
-out_name:
-	zfree(&me->name);
-out_me:
+}
+
+static void mep_delete(struct rblist *rl __maybe_unused,
+		       struct rb_node *nd)
+{
+	struct mep *me = container_of(nd, struct mep, nd);
+
+	zfree(&me->metric_group);
 	free(me);
-	return NULL;
 }
 
-static struct mep *mep_lookup(struct rblist *groups, const char *name)
+static struct mep *mep_lookup(struct rblist *groups, const char *metric_group,
+			      const char *metric_name)
 {
 	struct rb_node *nd;
 	struct mep me = {
-		.name = name
+		.metric_group = strdup(metric_group),
+		.metric_name = metric_name,
 	};
 	nd = rblist__find(groups, &me);
-	if (nd)
+	if (nd) {
+		free(me.metric_group);
 		return container_of(nd, struct mep, nd);
+	}
 	rblist__add_node(groups, &me);
 	nd = rblist__find(groups, &me);
 	if (nd)
@@ -404,107 +420,37 @@ static struct mep *mep_lookup(struct rblist *groups, const char *name)
 	return NULL;
 }
 
-static void mep_delete(struct rblist *rl __maybe_unused,
-		       struct rb_node *nd)
-{
-	struct mep *me = container_of(nd, struct mep, nd);
-
-	strlist__delete(me->metrics);
-	zfree(&me->name);
-	free(me);
-}
-
-static void metricgroup__print_strlist(struct strlist *metrics, bool raw)
-{
-	struct str_node *sn;
-	int n = 0;
-
-	strlist__for_each_entry (sn, metrics) {
-		if (raw)
-			printf("%s%s", n > 0 ? " " : "", sn->s);
-		else
-			printf("  %s\n", sn->s);
-		n++;
-	}
-	if (raw)
-		putchar('\n');
-}
-
-static int metricgroup__print_pmu_event(const struct pmu_event *pe,
-					bool metricgroups, char *filter,
-					bool raw, bool details,
-					struct rblist *groups,
-					struct strlist *metriclist)
+static int metricgroup__add_to_mep_groups(const struct pmu_event *pe,
+					struct rblist *groups)
 {
 	const char *g;
 	char *omg, *mg;
 
-	g = pe->metric_group;
-	if (!g && pe->metric_name) {
-		if (pe->name)
-			return 0;
-		g = "No_group";
-	}
-
-	if (!g)
-		return 0;
-
-	mg = strdup(g);
-
+	mg = strdup(pe->metric_group ?: "No_group");
 	if (!mg)
 		return -ENOMEM;
 	omg = mg;
 	while ((g = strsep(&mg, ";")) != NULL) {
 		struct mep *me;
-		char *s;
 
 		g = skip_spaces(g);
-		if (*g == 0)
-			g = "No_group";
-		if (filter && !strstr(g, filter))
-			continue;
-		if (raw)
-			s = (char *)pe->metric_name;
-		else {
-			if (asprintf(&s, "%s\n%*s%s]",
-				     pe->metric_name, 8, "[", pe->desc) < 0)
-				return -1;
-			if (details) {
-				if (asprintf(&s, "%s\n%*s%s]",
-					     s, 8, "[", pe->metric_expr) < 0)
-					return -1;
-			}
-		}
-
-		if (!s)
-			continue;
+		if (strlen(g))
+			me = mep_lookup(groups, g, pe->metric_name);
+		else
+			me = mep_lookup(groups, "No_group", pe->metric_name);
 
-		if (!metricgroups) {
-			strlist__add(metriclist, s);
-		} else {
-			me = mep_lookup(groups, g);
-			if (!me)
-				continue;
-			strlist__add(me->metrics, s);
+		if (me) {
+			me->metric_desc = pe->desc;
+			me->metric_long_desc = pe->long_desc;
+			me->metric_expr = pe->metric_expr;
+			me->metric_unit = pe->unit;
 		}
-
-		if (!raw)
-			free(s);
 	}
 	free(omg);
 
 	return 0;
 }
 
-struct metricgroup_print_sys_idata {
-	struct strlist *metriclist;
-	char *filter;
-	struct rblist *groups;
-	bool metricgroups;
-	bool raw;
-	bool details;
-};
-
 struct metricgroup_iter_data {
 	pmu_event_iter_fn fn;
 	void *data;
@@ -527,60 +473,26 @@ static int metricgroup__sys_event_iter(const struct pmu_event *pe,
 
 		return d->fn(pe, table, d->data);
 	}
-
 	return 0;
 }
 
-static int metricgroup__print_sys_event_iter(const struct pmu_event *pe,
-					     const struct pmu_events_table *table __maybe_unused,
-					     void *data)
-{
-	struct metricgroup_print_sys_idata *d = data;
-
-	return metricgroup__print_pmu_event(pe, d->metricgroups, d->filter, d->raw,
-				     d->details, d->groups, d->metriclist);
-}
-
-struct metricgroup_print_data {
-	const char *pmu_name;
-	struct strlist *metriclist;
-	char *filter;
-	struct rblist *groups;
-	bool metricgroups;
-	bool raw;
-	bool details;
-};
-
-static int metricgroup__print_callback(const struct pmu_event *pe,
-				       const struct pmu_events_table *table __maybe_unused,
-				       void *vdata)
+static int metricgroup__add_to_mep_groups_callback(const struct pmu_event *pe,
+						const struct pmu_events_table *table __maybe_unused,
+						void *vdata)
 {
-	struct metricgroup_print_data *data = vdata;
+	struct rblist *groups = vdata;
 
-	if (!pe->metric_expr)
-		return 0;
-
-	if (data->pmu_name && perf_pmu__is_hybrid(pe->pmu) && strcmp(data->pmu_name, pe->pmu))
+	if (!pe->metric_name)
 		return 0;
 
-	return metricgroup__print_pmu_event(pe, data->metricgroups, data->filter,
-					    data->raw, data->details, data->groups,
-					    data->metriclist);
+	return metricgroup__add_to_mep_groups(pe, groups);
 }
 
-void metricgroup__print(bool metrics, bool metricgroups, char *filter,
-			bool raw, bool details, const char *pmu_name)
+void metricgroup__print(const struct print_callbacks *print_cb, void *print_state)
 {
 	struct rblist groups;
-	struct rb_node *node, *next;
-	struct strlist *metriclist = NULL;
 	const struct pmu_events_table *table;
-
-	if (!metricgroups) {
-		metriclist = strlist__new(NULL, NULL);
-		if (!metriclist)
-			return;
-	}
+	struct rb_node *node, *next;
 
 	rblist__init(&groups);
 	groups.node_new = mep_new;
@@ -588,56 +500,31 @@ void metricgroup__print(bool metrics, bool metricgroups, char *filter,
 	groups.node_delete = mep_delete;
 	table = pmu_events_table__find();
 	if (table) {
-		struct metricgroup_print_data data = {
-			.pmu_name = pmu_name,
-			.metriclist = metriclist,
-			.metricgroups = metricgroups,
-			.filter = filter,
-			.raw = raw,
-			.details = details,
-			.groups = &groups,
-		};
-
 		pmu_events_table_for_each_event(table,
-						metricgroup__print_callback,
-						&data);
+						metricgroup__add_to_mep_groups_callback,
+						&groups);
 	}
 	{
 		struct metricgroup_iter_data data = {
-			.fn = metricgroup__print_sys_event_iter,
-			.data = (void *) &(struct metricgroup_print_sys_idata){
-				.metriclist = metriclist,
-				.metricgroups = metricgroups,
-				.filter = filter,
-				.raw = raw,
-				.details = details,
-				.groups = &groups,
-			},
+			.fn = metricgroup__add_to_mep_groups_callback,
+			.data = &groups,
 		};
-
 		pmu_for_each_sys_event(metricgroup__sys_event_iter, &data);
 	}
 
-	if (!filter || !rblist__empty(&groups)) {
-		if (metricgroups && !raw)
-			printf("\nMetric Groups:\n\n");
-		else if (metrics && !raw)
-			printf("\nMetrics:\n\n");
-	}
-
 	for (node = rb_first_cached(&groups.entries); node; node = next) {
 		struct mep *me = container_of(node, struct mep, nd);
 
-		if (metricgroups)
-			printf("%s%s%s", me->name, metrics && !raw ? ":" : "", raw ? " " : "\n");
-		if (metrics)
-			metricgroup__print_strlist(me->metrics, raw);
+		print_cb->print_metric(print_state,
+				me->metric_group,
+				me->metric_name,
+				me->metric_desc,
+				me->metric_long_desc,
+				me->metric_expr,
+				me->metric_unit);
 		next = rb_next(node);
 		rblist__remove_node(&groups, node);
 	}
-	if (!metricgroups)
-		metricgroup__print_strlist(metriclist, raw);
-	strlist__delete(metriclist);
 }
 
 static const char *code_characters = ",-=@";
@@ -764,7 +651,7 @@ static int metricgroup__build_event_string(struct strbuf *events,
 #define RETURN_IF_NON_ZERO(x) do { if (x) return x; } while (0)
 
 	hashmap__for_each_entry(ctx->ids, cur, bkt) {
-		const char *sep, *rsep, *id = cur->key;
+		const char *sep, *rsep, *id = cur->pkey;
 		enum perf_tool_event ev;
 
 		pr_debug("found event %s\n", id);
@@ -945,14 +832,14 @@ static int resolve_metric(struct list_head *metric_list,
 	hashmap__for_each_entry(root_metric->pctx->ids, cur, bkt) {
 		struct pmu_event pe;
 
-		if (metricgroup__find_metric(cur->key, table, &pe)) {
+		if (metricgroup__find_metric(cur->pkey, table, &pe)) {
 			pending = realloc(pending,
 					(pending_cnt + 1) * sizeof(struct to_resolve));
 			if (!pending)
 				return -ENOMEM;
 
 			memcpy(&pending[pending_cnt].pe, &pe, sizeof(pe));
-			pending[pending_cnt].key = cur->key;
+			pending[pending_cnt].key = cur->pkey;
 			pending_cnt++;
 		}
 	}
@@ -1433,7 +1320,7 @@ static int build_combined_expr_ctx(const struct list_head *metric_list,
 	list_for_each_entry(m, metric_list, nd) {
 		if (m->has_constraint && !m->modifier) {
 			hashmap__for_each_entry(m->pctx->ids, cur, bkt) {
-				dup = strdup(cur->key);
+				dup = strdup(cur->pkey);
 				if (!dup) {
 					ret = -ENOMEM;
 					goto err_out;
diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h
index 732d3a0d3334..0013cf582173 100644
--- a/tools/perf/util/metricgroup.h
+++ b/tools/perf/util/metricgroup.h
@@ -10,6 +10,7 @@
 struct evlist;
 struct evsel;
 struct option;
+struct print_callbacks;
 struct rblist;
 struct cgroup;
 
@@ -78,8 +79,7 @@ int metricgroup__parse_groups_test(struct evlist *evlist,
 				   bool metric_no_merge,
 				   struct rblist *metric_events);
 
-void metricgroup__print(bool metrics, bool groups, char *filter,
-			bool raw, bool details, const char *pmu_name);
+void metricgroup__print(const struct print_callbacks *print_cb, void *print_state);
 bool metricgroup__has_metric(const char *metric);
 int arch_get_runtimeparam(const struct pmu_event *pe __maybe_unused);
 void metricgroup__rblist_exit(struct rblist *metric_events);
diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index a4dff881be39..49093b21ee2d 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -111,7 +111,7 @@ static int perf_mmap__aio_bind(struct mmap *map, int idx, struct perf_cpu cpu, i
 			pr_err("Failed to allocate node mask for mbind: error %m\n");
 			return -1;
 		}
-		set_bit(node_index, node_mask);
+		__set_bit(node_index, node_mask);
 		if (mbind(data, mmap_len, MPOL_BIND, node_mask, node_index + 1 + 1, 0)) {
 			pr_err("Failed to bind [%p-%p] AIO buffer to node %lu: error %m\n",
 				data, data + mmap_len, node_index);
@@ -256,7 +256,7 @@ static void build_node_mask(int node, struct mmap_cpu_mask *mask)
 	for (idx = 0; idx < nr_cpus; idx++) {
 		cpu = perf_cpu_map__cpu(cpu_map, idx); /* map c index to online cpu index */
 		if (cpu__get_node(cpu) == node)
-			set_bit(cpu.cpu, mask->bits);
+			__set_bit(cpu.cpu, mask->bits);
 	}
 }
 
@@ -270,7 +270,7 @@ static int perf_mmap__setup_affinity_mask(struct mmap *map, struct mmap_params *
 	if (mp->affinity == PERF_AFFINITY_NODE && cpu__max_node() > 1)
 		build_node_mask(cpu__get_node(map->core.cpu), &map->affinity_mask);
 	else if (mp->affinity == PERF_AFFINITY_CPU)
-		set_bit(map->core.cpu.cpu, map->affinity_mask.bits);
+		__set_bit(map->core.cpu.cpu, map->affinity_mask.bits);
 
 	return 0;
 }
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index cd4ccec7f361..f944c3cd5efa 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -2,18 +2,13 @@
 #define __PERF_MMAP_H 1
 
 #include <internal/mmap.h>
-#include <linux/compiler.h>
-#include <linux/refcount.h>
 #include <linux/types.h>
-#include <linux/ring_buffer.h>
 #include <linux/bitops.h>
 #include <perf/cpumap.h>
-#include <stdbool.h>
 #ifdef HAVE_AIO_SUPPORT
 #include <aio.h>
 #endif
 #include "auxtrace.h"
-#include "event.h"
 #include "util/compress.h"
 
 struct aiocb;
diff --git a/tools/perf/util/parse-branch-options.c b/tools/perf/util/parse-branch-options.c
index 31faf2bb49ff..fd67d204d720 100644
--- a/tools/perf/util/parse-branch-options.c
+++ b/tools/perf/util/parse-branch-options.c
@@ -30,8 +30,11 @@ static const struct branch_mode branch_modes[] = {
 	BRANCH_OPT("cond", PERF_SAMPLE_BRANCH_COND),
 	BRANCH_OPT("ind_jmp", PERF_SAMPLE_BRANCH_IND_JUMP),
 	BRANCH_OPT("call", PERF_SAMPLE_BRANCH_CALL),
+	BRANCH_OPT("no_flags", PERF_SAMPLE_BRANCH_NO_FLAGS),
+	BRANCH_OPT("no_cycles", PERF_SAMPLE_BRANCH_NO_CYCLES),
 	BRANCH_OPT("save_type", PERF_SAMPLE_BRANCH_TYPE_SAVE),
 	BRANCH_OPT("stack", PERF_SAMPLE_BRANCH_CALL_STACK),
+	BRANCH_OPT("hw_index", PERF_SAMPLE_BRANCH_HW_INDEX),
 	BRANCH_OPT("priv", PERF_SAMPLE_BRANCH_PRIV_SAVE),
 	BRANCH_END
 };
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 5973f46c2375..21cce83462b3 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -266,6 +266,7 @@ __add_event(struct list_head *list, int *idx,
 	evsel->core.own_cpus = perf_cpu_map__get(cpus);
 	evsel->core.requires_cpu = pmu ? pmu->is_uncore : false;
 	evsel->auto_merge_stats = auto_merge_stats;
+	evsel->pmu = pmu;
 
 	if (name)
 		evsel->name = strdup(name);
@@ -444,6 +445,7 @@ out_free_terms:
 	return ret;
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 static void tracepoint_error(struct parse_events_error *e, int err,
 			     const char *sys, const char *name)
 {
@@ -592,6 +594,7 @@ static int add_tracepoint_multi_sys(struct list_head *list, int *idx,
 	closedir(events_dir);
 	return ret;
 }
+#endif /* HAVE_LIBTRACEEVENT */
 
 #ifdef HAVE_LIBBPF_SUPPORT
 struct __add_bpf_event_param {
@@ -1142,6 +1145,7 @@ static int config_term_pmu(struct perf_event_attr *attr,
 		return config_term_common(attr, term, err);
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 static int config_term_tracepoint(struct perf_event_attr *attr,
 				  struct parse_events_term *term,
 				  struct parse_events_error *err)
@@ -1169,6 +1173,7 @@ static int config_term_tracepoint(struct perf_event_attr *attr,
 
 	return 0;
 }
+#endif
 
 static int config_attr(struct perf_event_attr *attr,
 		       struct list_head *head,
@@ -1324,6 +1329,7 @@ int parse_events_add_tracepoint(struct list_head *list, int *idx,
 				struct parse_events_error *err,
 				struct list_head *head_config)
 {
+#ifdef HAVE_LIBTRACEEVENT
 	if (head_config) {
 		struct perf_event_attr attr;
 
@@ -1338,6 +1344,16 @@ int parse_events_add_tracepoint(struct list_head *list, int *idx,
 	else
 		return add_tracepoint_event(list, idx, sys, event,
 					    err, head_config);
+#else
+	(void)list;
+	(void)idx;
+	(void)sys;
+	(void)event;
+	(void)head_config;
+	parse_events_error__handle(err, 0, strdup("unsupported tracepoint"),
+				strdup("libtraceevent is necessary for tracepoint support"));
+	return -1;
+#endif
 }
 
 int parse_events_add_numeric(struct parse_events_state *parse_state,
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 07df7bb7b042..428e72eaafcc 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -18,7 +18,6 @@ struct parse_events_error;
 struct option;
 struct perf_pmu;
 
-bool have_tracepoints(struct list_head *evlist);
 bool is_event_supported(u8 type, u64 config);
 
 const char *event_type(int type);
diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
index 872dd3d38782..57a567ee2cea 100644
--- a/tools/perf/util/perf_regs.c
+++ b/tools/perf/util/perf_regs.c
@@ -2,7 +2,7 @@
 #include <errno.h>
 #include <string.h>
 #include "perf_regs.h"
-#include "event.h"
+#include "util/sample.h"
 
 int __weak arch_sdt_arg_parse_op(char *old_op __maybe_unused,
 				 char **new_op __maybe_unused)
diff --git a/tools/perf/util/pfm.c b/tools/perf/util/pfm.c
index f0bcfcab1a93..ac3227ba769c 100644
--- a/tools/perf/util/pfm.c
+++ b/tools/perf/util/pfm.c
@@ -12,6 +12,7 @@
 #include "util/parse-events.h"
 #include "util/pmu.h"
 #include "util/pfm.h"
+#include "util/strbuf.h"
 
 #include <string.h>
 #include <linux/kernel.h>
@@ -130,53 +131,36 @@ static const char *srcs[PFM_ATTR_CTRL_MAX] = {
 };
 
 static void
-print_attr_flags(pfm_event_attr_info_t *info)
+print_attr_flags(struct strbuf *buf, const pfm_event_attr_info_t *info)
 {
-	int n = 0;
+	if (info->is_dfl)
+		strbuf_addf(buf, "[default] ");
 
-	if (info->is_dfl) {
-		printf("[default] ");
-		n++;
-	}
-
-	if (info->is_precise) {
-		printf("[precise] ");
-		n++;
-	}
-
-	if (!n)
-		printf("- ");
+	if (info->is_precise)
+		strbuf_addf(buf, "[precise] ");
 }
 
 static void
-print_libpfm_events_detailed(pfm_event_info_t *info, bool long_desc)
+print_libpfm_event(const struct print_callbacks *print_cb, void *print_state,
+		const pfm_pmu_info_t *pinfo, const pfm_event_info_t *info,
+		struct strbuf *buf)
 {
-	pfm_event_attr_info_t ainfo;
-	const char *src;
 	int j, ret;
+	char topic[80], name[80];
 
-	ainfo.size = sizeof(ainfo);
+	strbuf_setlen(buf, 0);
+	snprintf(topic, sizeof(topic), "pfm %s", pinfo->name);
 
-	printf("  %s\n", info->name);
-	printf("    [%s]\n", info->desc);
-	if (long_desc) {
-		if (info->equiv)
-			printf("      Equiv: %s\n", info->equiv);
+	snprintf(name, sizeof(name), "%s::%s", pinfo->name, info->name);
+	strbuf_addf(buf, "Code: 0x%"PRIx64"\n", info->code);
 
-		printf("      Code  : 0x%"PRIx64"\n", info->code);
-	}
 	pfm_for_each_event_attr(j, info) {
-		ret = pfm_get_event_attr_info(info->idx, j,
-					      PFM_OS_PERF_EVENT_EXT, &ainfo);
-		if (ret != PFM_SUCCESS)
-			continue;
-
-		if (ainfo.type == PFM_ATTR_UMASK) {
-			printf("      %s:%s\n", info->name, ainfo.name);
-			printf("        [%s]\n", ainfo.desc);
-		}
+		pfm_event_attr_info_t ainfo;
+		const char *src;
 
-		if (!long_desc)
+		ainfo.size = sizeof(ainfo);
+		ret = pfm_get_event_attr_info(info->idx, j, PFM_OS_PERF_EVENT_EXT, &ainfo);
+		if (ret != PFM_SUCCESS)
 			continue;
 
 		if (ainfo.ctrl >= PFM_ATTR_CTRL_MAX)
@@ -184,64 +168,74 @@ print_libpfm_events_detailed(pfm_event_info_t *info, bool long_desc)
 
 		src = srcs[ainfo.ctrl];
 		switch (ainfo.type) {
-		case PFM_ATTR_UMASK:
-			printf("        Umask : 0x%02"PRIx64" : %s: ",
-				ainfo.code, src);
-			print_attr_flags(&ainfo);
-			putchar('\n');
+		case PFM_ATTR_UMASK: /* Ignore for now */
 			break;
 		case PFM_ATTR_MOD_BOOL:
-			printf("      Modif : %s: [%s] : %s (boolean)\n", src,
-				ainfo.name, ainfo.desc);
+			strbuf_addf(buf, " Modif: %s: [%s] : %s (boolean)\n", src,
+				    ainfo.name, ainfo.desc);
 			break;
 		case PFM_ATTR_MOD_INTEGER:
-			printf("      Modif : %s: [%s] : %s (integer)\n", src,
-				ainfo.name, ainfo.desc);
+			strbuf_addf(buf, " Modif: %s: [%s] : %s (integer)\n", src,
+				    ainfo.name, ainfo.desc);
 			break;
 		case PFM_ATTR_NONE:
 		case PFM_ATTR_RAW_UMASK:
 		case PFM_ATTR_MAX:
 		default:
-			printf("      Attr  : %s: [%s] : %s\n", src,
-				ainfo.name, ainfo.desc);
+			strbuf_addf(buf, " Attr: %s: [%s] : %s\n", src,
+				    ainfo.name, ainfo.desc);
 		}
 	}
-}
+	print_cb->print_event(print_state,
+			pinfo->name,
+			topic,
+			name, info->equiv,
+			/*scale_unit=*/NULL,
+			/*deprecated=*/NULL, "PFM event",
+			info->desc, /*long_desc=*/NULL,
+			/*encoding_desc=*/buf->buf,
+			/*metric_name=*/NULL, /*metric_expr=*/NULL);
 
-/*
- * list all pmu::event:umask, pmu::event
- * printed events may not be all valid combinations of umask for an event
- */
-static void
-print_libpfm_events_raw(pfm_pmu_info_t *pinfo, pfm_event_info_t *info)
-{
-	pfm_event_attr_info_t ainfo;
-	int j, ret;
-	bool has_umask = false;
+	pfm_for_each_event_attr(j, info) {
+		pfm_event_attr_info_t ainfo;
+		const char *src;
 
-	ainfo.size = sizeof(ainfo);
+		strbuf_setlen(buf, 0);
 
-	pfm_for_each_event_attr(j, info) {
-		ret = pfm_get_event_attr_info(info->idx, j,
-					      PFM_OS_PERF_EVENT_EXT, &ainfo);
+		ainfo.size = sizeof(ainfo);
+		ret = pfm_get_event_attr_info(info->idx, j, PFM_OS_PERF_EVENT_EXT, &ainfo);
 		if (ret != PFM_SUCCESS)
 			continue;
 
-		if (ainfo.type != PFM_ATTR_UMASK)
-			continue;
+		if (ainfo.ctrl >= PFM_ATTR_CTRL_MAX)
+			ainfo.ctrl = PFM_ATTR_CTRL_UNKNOWN;
 
-		printf("%s::%s:%s\n", pinfo->name, info->name, ainfo.name);
-		has_umask = true;
+		src = srcs[ainfo.ctrl];
+		if (ainfo.type == PFM_ATTR_UMASK) {
+			strbuf_addf(buf, "Umask: 0x%02"PRIx64" : %s: ",
+				ainfo.code, src);
+			print_attr_flags(buf, &ainfo);
+			snprintf(name, sizeof(name), "%s::%s:%s",
+				 pinfo->name, info->name, ainfo.name);
+			print_cb->print_event(print_state,
+					pinfo->name,
+					topic,
+					name, /*alias=*/NULL,
+					/*scale_unit=*/NULL,
+					/*deprecated=*/NULL, "PFM event",
+					ainfo.desc, /*long_desc=*/NULL,
+					/*encoding_desc=*/buf->buf,
+					/*metric_name=*/NULL, /*metric_expr=*/NULL);
+		}
 	}
-	if (!has_umask)
-		printf("%s::%s\n", pinfo->name, info->name);
 }
 
-void print_libpfm_events(bool name_only, bool long_desc)
+void print_libpfm_events(const struct print_callbacks *print_cb, void *print_state)
 {
 	pfm_event_info_t info;
 	pfm_pmu_info_t pinfo;
-	int i, p, ret;
+	int p, ret;
+	struct strbuf storage;
 
 	libpfm_initialize();
 
@@ -249,12 +243,9 @@ void print_libpfm_events(bool name_only, bool long_desc)
 	info.size  = sizeof(info);
 	pinfo.size = sizeof(pinfo);
 
-	if (!name_only)
-		puts("\nList of pre-defined events (to be used in --pfm-events):\n");
+	strbuf_init(&storage, 2048);
 
 	pfm_for_all_pmus(p) {
-		bool printed_pmu = false;
-
 		ret = pfm_get_pmu_info(p, &pinfo);
 		if (ret != PFM_SUCCESS)
 			continue;
@@ -267,25 +258,14 @@ void print_libpfm_events(bool name_only, bool long_desc)
 		if (pinfo.pmu == PFM_PMU_PERF_EVENT)
 			continue;
 
-		for (i = pinfo.first_event; i != -1;
-		     i = pfm_get_event_next(i)) {
-
+		for (int i = pinfo.first_event; i != -1; i = pfm_get_event_next(i)) {
 			ret = pfm_get_event_info(i, PFM_OS_PERF_EVENT_EXT,
 						&info);
 			if (ret != PFM_SUCCESS)
 				continue;
 
-			if (!name_only && !printed_pmu) {
-				printf("%s:\n", pinfo.name);
-				printed_pmu = true;
-			}
-
-			if (!name_only)
-				print_libpfm_events_detailed(&info, long_desc);
-			else
-				print_libpfm_events_raw(&pinfo, &info);
+			print_libpfm_event(print_cb, print_state, &pinfo, &info, &storage);
 		}
-		if (!name_only && printed_pmu)
-			putchar('\n');
 	}
+	strbuf_release(&storage);
 }
diff --git a/tools/perf/util/pfm.h b/tools/perf/util/pfm.h
index 7d70dda87012..fb25c2749d26 100644
--- a/tools/perf/util/pfm.h
+++ b/tools/perf/util/pfm.h
@@ -7,13 +7,14 @@
 #ifndef __PERF_PFM_H
 #define __PERF_PFM_H
 
+#include "print-events.h"
 #include <subcmd/parse-options.h>
 
 #ifdef HAVE_LIBPFM
 int parse_libpfm_events_option(const struct option *opt, const char *str,
 			int unset);
 
-void print_libpfm_events(bool name_only, bool long_desc);
+void print_libpfm_events(const struct print_callbacks *print_cb, void *print_state);
 
 #else
 #include <linux/compiler.h>
@@ -26,8 +27,8 @@ static inline int parse_libpfm_events_option(
 	return 0;
 }
 
-static inline void print_libpfm_events(bool name_only __maybe_unused,
-				       bool long_desc __maybe_unused)
+static inline void print_libpfm_events(const struct print_callbacks *print_cb __maybe_unused,
+				       void *print_state __maybe_unused)
 {
 }
 
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 03284059175f..2bdeb89352e7 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -22,7 +22,9 @@
 #include "debug.h"
 #include "evsel.h"
 #include "pmu.h"
+#include "pmus.h"
 #include "parse-events.h"
+#include "print-events.h"
 #include "header.h"
 #include "string2.h"
 #include "strbuf.h"
@@ -31,17 +33,32 @@
 
 struct perf_pmu perf_pmu__fake;
 
+/**
+ * struct perf_pmu_format - Values from a format file read from
+ * <sysfs>/devices/cpu/format/ held in struct perf_pmu.
+ *
+ * For example, the contents of <sysfs>/devices/cpu/format/event may be
+ * "config:0-7" and will be represented here as name="event",
+ * value=PERF_PMU_FORMAT_VALUE_CONFIG and bits 0 to 7 will be set.
+ */
 struct perf_pmu_format {
+	/** @name: The modifier/file name. */
 	char *name;
+	/**
+	 * @value : Which config value the format relates to. Supported values
+	 * are from PERF_PMU_FORMAT_VALUE_CONFIG to
+	 * PERF_PMU_FORMAT_VALUE_CONFIG_END.
+	 */
 	int value;
+	/** @bits: Which config bits are set by this format value. */
 	DECLARE_BITMAP(bits, PERF_PMU_FORMAT_BITS);
+	/** @list: Element on list within struct perf_pmu. */
 	struct list_head list;
 };
 
 int perf_pmu_parse(struct list_head *list, char *name);
 extern FILE *perf_pmu_in;
 
-static LIST_HEAD(pmus);
 static bool hybrid_scanned;
 
 /*
@@ -980,7 +997,6 @@ static struct perf_pmu *pmu_lookup(const char *lookup_name)
 	pmu->is_uncore = pmu_is_uncore(name);
 	if (pmu->is_uncore)
 		pmu->id = pmu_id(name);
-	pmu->is_hybrid = is_hybrid;
 	pmu->max_precise = pmu_max_precise(name);
 	pmu_add_cpu_aliases(&aliases, pmu);
 	pmu_add_sys_aliases(&aliases, pmu);
@@ -992,7 +1008,7 @@ static struct perf_pmu *pmu_lookup(const char *lookup_name)
 	list_splice(&aliases, &pmu->aliases);
 	list_add_tail(&pmu->list, &pmus);
 
-	if (pmu->is_hybrid)
+	if (is_hybrid)
 		list_add_tail(&pmu->hybrid_list, &perf_pmu__hybrid_pmus);
 
 	pmu->default_config = perf_pmu__get_default_config(pmu);
@@ -1065,11 +1081,15 @@ struct perf_pmu *evsel__find_pmu(struct evsel *evsel)
 {
 	struct perf_pmu *pmu = NULL;
 
+	if (evsel->pmu)
+		return evsel->pmu;
+
 	while ((pmu = perf_pmu__scan(pmu)) != NULL) {
 		if (pmu->type == evsel->core.attr.type)
 			break;
 	}
 
+	evsel->pmu = pmu;
 	return pmu;
 }
 
@@ -1513,7 +1533,7 @@ void perf_pmu__set_format(unsigned long *bits, long from, long to)
 
 	memset(bits, 0, BITS_TO_BYTES(PERF_PMU_FORMAT_BITS));
 	for (b = from; b <= to; b++)
-		set_bit(b, bits);
+		__set_bit(b, bits);
 }
 
 void perf_pmu__del_formats(struct list_head *formats)
@@ -1534,8 +1554,8 @@ static int sub_non_neg(int a, int b)
 	return a - b;
 }
 
-static char *format_alias(char *buf, int len, struct perf_pmu *pmu,
-			  struct perf_pmu_alias *alias)
+static char *format_alias(char *buf, int len, const struct perf_pmu *pmu,
+			  const struct perf_pmu_alias *alias)
 {
 	struct parse_events_term *term;
 	int used = snprintf(buf, len, "%s/%s", pmu->name, alias->name);
@@ -1560,72 +1580,60 @@ static char *format_alias(char *buf, int len, struct perf_pmu *pmu,
 	return buf;
 }
 
-static char *format_alias_or(char *buf, int len, struct perf_pmu *pmu,
-			     struct perf_pmu_alias *alias)
-{
-	snprintf(buf, len, "%s OR %s/%s/", alias->name, pmu->name, alias->name);
-	return buf;
-}
-
+/** Struct for ordering events as output in perf list. */
 struct sevent {
-	char *name;
-	char *desc;
-	char *topic;
-	char *str;
-	char *pmu;
-	char *metric_expr;
-	char *metric_name;
-	int is_cpu;
+	/** PMU for event. */
+	const struct perf_pmu *pmu;
+	/**
+	 * Optional event for name, desc, etc. If not present then this is a
+	 * selectable PMU and the event name is shown as "//".
+	 */
+	const struct perf_pmu_alias *event;
+	/** Is the PMU for the CPU? */
+	bool is_cpu;
 };
 
 static int cmp_sevent(const void *a, const void *b)
 {
 	const struct sevent *as = a;
 	const struct sevent *bs = b;
+	const char *a_pmu_name, *b_pmu_name;
+	const char *a_name = "//", *a_desc = NULL, *a_topic = "";
+	const char *b_name = "//", *b_desc = NULL, *b_topic = "";
 	int ret;
 
-	/* Put extra events last */
-	if (!!as->desc != !!bs->desc)
-		return !!as->desc - !!bs->desc;
-	if (as->topic && bs->topic) {
-		int n = strcmp(as->topic, bs->topic);
-
-		if (n)
-			return n;
+	if (as->event) {
+		a_name = as->event->name;
+		a_desc = as->event->desc;
+		a_topic = as->event->topic ?: "";
 	}
-
-	/* Order CPU core events to be first */
-	if (as->is_cpu != bs->is_cpu)
-		return bs->is_cpu - as->is_cpu;
-
-	ret = strcmp(as->name, bs->name);
-	if (!ret) {
-		if (as->pmu && bs->pmu)
-			return strcmp(as->pmu, bs->pmu);
+	if (bs->event) {
+		b_name = bs->event->name;
+		b_desc = bs->event->desc;
+		b_topic = bs->event->topic ?: "";
 	}
+	/* Put extra events last. */
+	if (!!a_desc != !!b_desc)
+		return !!a_desc - !!b_desc;
 
-	return ret;
-}
+	/* Order by topics. */
+	ret = strcmp(a_topic, b_topic);
+	if (ret)
+		return ret;
 
-static void wordwrap(char *s, int start, int max, int corr)
-{
-	int column = start;
-	int n;
+	/* Order CPU core events to be first */
+	if (as->is_cpu != bs->is_cpu)
+		return as->is_cpu ? -1 : 1;
 
-	while (*s) {
-		int wlen = strcspn(s, " \t");
+	/* Order by PMU name. */
+	a_pmu_name = as->pmu->name ?: "";
+	b_pmu_name = bs->pmu->name ?: "";
+	ret = strcmp(a_pmu_name, b_pmu_name);
+	if (ret)
+		return ret;
 
-		if (column + wlen >= max && column > start) {
-			printf("\n%*s", start, "");
-			column = start + corr;
-		}
-		n = printf("%s%.*s", column > start ? " " : "", wlen, s);
-		if (n <= 0)
-			break;
-		s += wlen;
-		column += n;
-		s = skip_spaces(s);
-	}
+	/* Order by event name. */
+	return strcmp(a_name, b_name);
 }
 
 bool is_pmu_core(const char *name)
@@ -1636,147 +1644,127 @@ bool is_pmu_core(const char *name)
 static bool pmu_alias_is_duplicate(struct sevent *alias_a,
 				   struct sevent *alias_b)
 {
-	/* Different names -> never duplicates */
-	if (strcmp(alias_a->name, alias_b->name))
-		return false;
+	const char *a_pmu_name, *b_pmu_name;
+	const char *a_name = alias_a->event ? alias_a->event->name : "//";
+	const char *b_name = alias_b->event ? alias_b->event->name : "//";
 
-	/* Don't remove duplicates for hybrid PMUs */
-	if (perf_pmu__is_hybrid(alias_a->pmu) &&
-	    perf_pmu__is_hybrid(alias_b->pmu))
+	/* Different names -> never duplicates */
+	if (strcmp(a_name, b_name))
 		return false;
 
-	return true;
+	/* Don't remove duplicates for different PMUs */
+	a_pmu_name = alias_a->pmu->name ?: "";
+	b_pmu_name = alias_b->pmu->name ?: "";
+	return strcmp(a_pmu_name, b_pmu_name) == 0;
 }
 
-void print_pmu_events(const char *event_glob, bool name_only, bool quiet_flag,
-			bool long_desc, bool details_flag, bool deprecated,
-			const char *pmu_name)
+void print_pmu_events(const struct print_callbacks *print_cb, void *print_state)
 {
 	struct perf_pmu *pmu;
-	struct perf_pmu_alias *alias;
+	struct perf_pmu_alias *event;
 	char buf[1024];
 	int printed = 0;
 	int len, j;
 	struct sevent *aliases;
-	int numdesc = 0;
-	int columns = pager_get_columns();
-	char *topic = NULL;
 
 	pmu = NULL;
 	len = 0;
 	while ((pmu = perf_pmu__scan(pmu)) != NULL) {
-		list_for_each_entry(alias, &pmu->aliases, list)
+		list_for_each_entry(event, &pmu->aliases, list)
 			len++;
 		if (pmu->selectable)
 			len++;
 	}
 	aliases = zalloc(sizeof(struct sevent) * len);
-	if (!aliases)
-		goto out_enomem;
+	if (!aliases) {
+		pr_err("FATAL: not enough memory to print PMU events\n");
+		return;
+	}
 	pmu = NULL;
 	j = 0;
 	while ((pmu = perf_pmu__scan(pmu)) != NULL) {
-		if (pmu_name && perf_pmu__is_hybrid(pmu->name) &&
-		    strcmp(pmu_name, pmu->name)) {
-			continue;
-		}
-
-		list_for_each_entry(alias, &pmu->aliases, list) {
-			char *name = alias->desc ? alias->name :
-				format_alias(buf, sizeof(buf), pmu, alias);
-			bool is_cpu = is_pmu_core(pmu->name) ||
-				      perf_pmu__is_hybrid(pmu->name);
+		bool is_cpu = is_pmu_core(pmu->name) || perf_pmu__is_hybrid(pmu->name);
 
-			if (alias->deprecated && !deprecated)
-				continue;
-
-			if (event_glob != NULL &&
-			    !(strglobmatch_nocase(name, event_glob) ||
-			      (!is_cpu && strglobmatch_nocase(alias->name,
-						       event_glob)) ||
-			      (alias->topic &&
-			       strglobmatch_nocase(alias->topic, event_glob))))
-				continue;
-
-			if (is_cpu && !name_only && !alias->desc)
-				name = format_alias_or(buf, sizeof(buf), pmu, alias);
-
-			aliases[j].name = name;
-			if (is_cpu && !name_only && !alias->desc)
-				aliases[j].name = format_alias_or(buf,
-								  sizeof(buf),
-								  pmu, alias);
-			aliases[j].name = strdup(aliases[j].name);
-			if (!aliases[j].name)
-				goto out_enomem;
-
-			aliases[j].desc = long_desc ? alias->long_desc :
-						alias->desc;
-			aliases[j].topic = alias->topic;
-			aliases[j].str = alias->str;
-			aliases[j].pmu = pmu->name;
-			aliases[j].metric_expr = alias->metric_expr;
-			aliases[j].metric_name = alias->metric_name;
+		list_for_each_entry(event, &pmu->aliases, list) {
+			aliases[j].event = event;
+			aliases[j].pmu = pmu;
 			aliases[j].is_cpu = is_cpu;
 			j++;
 		}
-		if (pmu->selectable &&
-		    (event_glob == NULL || strglobmatch(pmu->name, event_glob))) {
-			char *s;
-			if (asprintf(&s, "%s//", pmu->name) < 0)
-				goto out_enomem;
-			aliases[j].name = s;
+		if (pmu->selectable) {
+			aliases[j].event = NULL;
+			aliases[j].pmu = pmu;
+			aliases[j].is_cpu = is_cpu;
 			j++;
 		}
 	}
 	len = j;
 	qsort(aliases, len, sizeof(struct sevent), cmp_sevent);
 	for (j = 0; j < len; j++) {
+		const char *name, *alias = NULL, *scale_unit = NULL,
+			*desc = NULL, *long_desc = NULL,
+			*encoding_desc = NULL, *topic = NULL,
+			*metric_name = NULL, *metric_expr = NULL;
+		bool deprecated = false;
+		size_t buf_used;
+
 		/* Skip duplicates */
 		if (j > 0 && pmu_alias_is_duplicate(&aliases[j], &aliases[j - 1]))
 			continue;
 
-		if (name_only) {
-			printf("%s ", aliases[j].name);
-			continue;
-		}
-		if (aliases[j].desc && !quiet_flag) {
-			if (numdesc++ == 0)
-				printf("\n");
-			if (aliases[j].topic && (!topic ||
-					strcmp(topic, aliases[j].topic))) {
-				printf("%s%s:\n", topic ? "\n" : "",
-						aliases[j].topic);
-				topic = aliases[j].topic;
+		if (!aliases[j].event) {
+			/* A selectable event. */
+			buf_used = snprintf(buf, sizeof(buf), "%s//", aliases[j].pmu->name) + 1;
+			name = buf;
+		} else {
+			if (aliases[j].event->desc) {
+				name = aliases[j].event->name;
+				buf_used = 0;
+			} else {
+				name = format_alias(buf, sizeof(buf), aliases[j].pmu,
+						    aliases[j].event);
+				if (aliases[j].is_cpu) {
+					alias = name;
+					name = aliases[j].event->name;
+				}
+				buf_used = strlen(buf) + 1;
 			}
-			printf("  %-50s\n", aliases[j].name);
-			printf("%*s", 8, "[");
-			wordwrap(aliases[j].desc, 8, columns, 0);
-			printf("]\n");
-			if (details_flag) {
-				printf("%*s%s/%s/ ", 8, "", aliases[j].pmu, aliases[j].str);
-				if (aliases[j].metric_name)
-					printf(" MetricName: %s", aliases[j].metric_name);
-				if (aliases[j].metric_expr)
-					printf(" MetricExpr: %s", aliases[j].metric_expr);
-				putchar('\n');
+			if (strlen(aliases[j].event->unit) || aliases[j].event->scale != 1.0) {
+				scale_unit = buf + buf_used;
+				buf_used += snprintf(buf + buf_used, sizeof(buf) - buf_used,
+						"%G%s", aliases[j].event->scale,
+						aliases[j].event->unit) + 1;
 			}
-		} else
-			printf("  %-50s [Kernel PMU event]\n", aliases[j].name);
-		printed++;
+			desc = aliases[j].event->desc;
+			long_desc = aliases[j].event->long_desc;
+			topic = aliases[j].event->topic;
+			encoding_desc = buf + buf_used;
+			buf_used += snprintf(buf + buf_used, sizeof(buf) - buf_used,
+					"%s/%s/", aliases[j].pmu->name,
+					aliases[j].event->str) + 1;
+			metric_name = aliases[j].event->metric_name;
+			metric_expr = aliases[j].event->metric_expr;
+			deprecated = aliases[j].event->deprecated;
+		}
+		print_cb->print_event(print_state,
+				aliases[j].pmu->name,
+				topic,
+				name,
+				alias,
+				scale_unit,
+				deprecated,
+				"Kernel PMU event",
+				desc,
+				long_desc,
+				encoding_desc,
+				metric_name,
+				metric_expr);
 	}
 	if (printed && pager_in_use())
 		printf("\n");
-out_free:
-	for (j = 0; j < len; j++)
-		zfree(&aliases[j].name);
+
 	zfree(&aliases);
 	return;
-
-out_enomem:
-	printf("FATAL: not enough memory to print PMU events\n");
-	if (aliases)
-		goto out_free;
 }
 
 bool pmu_have_event(const char *pname, const char *name)
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 68e15c38ae71..69ca0004f94f 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -12,6 +12,7 @@
 
 struct evsel_config_term;
 struct perf_cpu_map;
+struct print_callbacks;
 
 enum {
 	PERF_PMU_FORMAT_VALUE_CONFIG,
@@ -33,31 +34,101 @@ struct perf_pmu_caps {
 	struct list_head list;
 };
 
+/**
+ * struct perf_pmu - hi
+ */
 struct perf_pmu {
+	/** @name: The name of the PMU such as "cpu". */
 	char *name;
+	/**
+	 * @alias_name: Optional alternate name for the PMU determined in
+	 * architecture specific code.
+	 */
 	char *alias_name;
+	/**
+	 * @id: Optional PMU identifier read from
+	 * <sysfs>/bus/event_source/devices/<name>/identifier.
+	 */
 	char *id;
+	/**
+	 * @type: Perf event attributed type value, read from
+	 * <sysfs>/bus/event_source/devices/<name>/type.
+	 */
 	__u32 type;
+	/**
+	 * @selectable: Can the PMU name be selected as if it were an event?
+	 */
 	bool selectable;
+	/**
+	 * @is_uncore: Is the PMU not within the CPU core? Determined by the
+	 * presence of <sysfs>/bus/event_source/devices/<name>/cpumask.
+	 */
 	bool is_uncore;
-	bool is_hybrid;
+	/**
+	 * @auxtrace: Are events auxiliary events? Determined in architecture
+	 * specific code.
+	 */
 	bool auxtrace;
+	/**
+	 * @max_precise: Number of levels of :ppp precision supported by the
+	 * PMU, read from
+	 * <sysfs>/bus/event_source/devices/<name>/caps/max_precise.
+	 */
 	int max_precise;
+	/**
+	 * @default_config: Optional default perf_event_attr determined in
+	 * architecture specific code.
+	 */
 	struct perf_event_attr *default_config;
+	/**
+	 * @cpus: Empty or the contents of either of:
+	 * <sysfs>/bus/event_source/devices/<name>/cpumask.
+	 * <sysfs>/bus/event_source/devices/<cpu>/cpus.
+	 */
 	struct perf_cpu_map *cpus;
-	struct list_head format;  /* HEAD struct perf_pmu_format -> list */
-	struct list_head aliases; /* HEAD struct perf_pmu_alias -> list */
+	/**
+	 * @format: Holds the contents of files read from
+	 * <sysfs>/bus/event_source/devices/<name>/format/. The contents specify
+	 * which event parameter changes what config, config1 or config2 bits.
+	 */
+	struct list_head format;
+	/**
+	 * @aliases: List of struct perf_pmu_alias. Each alias corresponds to an
+	 * event read from <sysfs>/bus/event_source/devices/<name>/events/ or
+	 * from json events in pmu-events.c.
+	 */
+	struct list_head aliases;
+	/** @caps_initialized: Has the list caps been initialized? */
 	bool caps_initialized;
+	/** @nr_caps: The length of the list caps. */
 	u32 nr_caps;
-	struct list_head caps;    /* HEAD struct perf_pmu_caps -> list */
-	struct list_head list;    /* ELEM */
+	/**
+	 * @caps: Holds the contents of files read from
+	 * <sysfs>/bus/event_source/devices/<name>/caps/.
+	 *
+	 * The contents are pairs of the filename with the value of its
+	 * contents, for example, max_precise (see above) may have a value of 3.
+	 */
+	struct list_head caps;
+	/** @list: Element on pmus list in pmu.c. */
+	struct list_head list;
+	/** @hybrid_list: Element on perf_pmu__hybrid_pmus. */
 	struct list_head hybrid_list;
 
+	/**
+	 * @missing_features: Features to inhibit when events on this PMU are
+	 * opened.
+	 */
 	struct {
+		/**
+		 * @exclude_guest: Disables perf_event_attr exclude_guest and
+		 * exclude_host.
+		 */
 		bool exclude_guest;
 	} missing_features;
 };
 
+/** @perf_pmu__fake: A special global PMU used for testing. */
 extern struct perf_pmu perf_pmu__fake;
 
 struct perf_pmu_info {
@@ -71,21 +142,60 @@ struct perf_pmu_info {
 
 #define UNIT_MAX_LEN	31 /* max length for event unit name */
 
+/**
+ * struct perf_pmu_alias - An event either read from sysfs or builtin in
+ * pmu-events.c, created by parsing the pmu-events json files.
+ */
 struct perf_pmu_alias {
+	/** @name: Name of the event like "mem-loads". */
 	char *name;
+	/** @desc: Optional short description of the event. */
 	char *desc;
+	/** @long_desc: Optional long description. */
 	char *long_desc;
+	/**
+	 * @topic: Optional topic such as cache or pipeline, particularly for
+	 * json events.
+	 */
 	char *topic;
+	/**
+	 * @str: Comma separated parameter list like
+	 * "event=0xcd,umask=0x1,ldlat=0x3".
+	 */
 	char *str;
-	struct list_head terms; /* HEAD struct parse_events_term -> list */
-	struct list_head list;  /* ELEM */
+	/** @terms: Owned list of the original parsed parameters. */
+	struct list_head terms;
+	/** @list: List element of struct perf_pmu aliases. */
+	struct list_head list;
+	/** @unit: Units for the event, such as bytes or cache lines. */
 	char unit[UNIT_MAX_LEN+1];
+	/** @scale: Value to scale read counter values by. */
 	double scale;
+	/**
+	 * @per_pkg: Does the file
+	 * <sysfs>/bus/event_source/devices/<pmu_name>/events/<name>.per-pkg or
+	 * equivalent json value exist and have the value 1.
+	 */
 	bool per_pkg;
+	/**
+	 * @snapshot: Does the file
+	 * <sysfs>/bus/event_source/devices/<pmu_name>/events/<name>.snapshot
+	 * exist and have the value 1.
+	 */
 	bool snapshot;
+	/**
+	 * @deprecated: Is the event hidden and so not shown in perf list by
+	 * default.
+	 */
 	bool deprecated;
+	/**
+	 * @metric_expr: A metric expression associated with an event. Doing
+	 * this makes little sense due to scale and unit applying to both.
+	 */
 	char *metric_expr;
+	/** @metric_name: A name for the metric. unit applying to both. */
 	char *metric_name;
+	/** @pmu_name: The name copied from struct perf_pmu. */
 	char *pmu_name;
 };
 
@@ -116,9 +226,7 @@ void perf_pmu__del_formats(struct list_head *formats);
 struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu);
 
 bool is_pmu_core(const char *name);
-void print_pmu_events(const char *event_glob, bool name_only, bool quiet,
-		      bool long_desc, bool details_flag,
-		      bool deprecated, const char *pmu_name);
+void print_pmu_events(const struct print_callbacks *print_cb, void *print_state);
 bool pmu_have_event(const char *pname, const char *name);
 
 int perf_pmu__scan_file(struct perf_pmu *pmu, const char *name, const char *fmt, ...) __scanf(3, 4);
diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c
new file mode 100644
index 000000000000..7f3b93c4d229
--- /dev/null
+++ b/tools/perf/util/pmus.c
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/list.h>
+#include <pmus.h>
+
+LIST_HEAD(pmus);
diff --git a/tools/perf/util/pmus.h b/tools/perf/util/pmus.h
new file mode 100644
index 000000000000..5ec12007eb5c
--- /dev/null
+++ b/tools/perf/util/pmus.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PMUS_H
+#define __PMUS_H
+
+extern struct list_head pmus;
+
+#define perf_pmus__for_each_pmu(pmu) list_for_each_entry(pmu, &pmus, list)
+
+#endif /* __PMUS_H */
diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c
index c4d5d87fae2f..2646ae18d9f9 100644
--- a/tools/perf/util/print-events.c
+++ b/tools/perf/util/print-events.c
@@ -28,6 +28,7 @@
 
 #define MAX_NAME_LEN 100
 
+/** Strings corresponding to enum perf_type_id. */
 static const char * const event_type_descriptors[] = {
 	"Hardware event",
 	"Software event",
@@ -52,125 +53,77 @@ static const struct event_symbol event_symbols_tool[PERF_TOOL_MAX] = {
 	},
 };
 
-static int cmp_string(const void *a, const void *b)
-{
-	const char * const *as = a;
-	const char * const *bs = b;
-
-	return strcmp(*as, *bs);
-}
-
 /*
  * Print the events from <debugfs_mount_point>/tracing/events
  */
-void print_tracepoint_events(const char *subsys_glob,
-			     const char *event_glob, bool name_only)
+void print_tracepoint_events(const struct print_callbacks *print_cb, void *print_state)
 {
-	DIR *sys_dir, *evt_dir;
-	struct dirent *sys_dirent, *evt_dirent;
-	char evt_path[MAXPATHLEN];
-	char *dir_path;
-	char **evt_list = NULL;
-	unsigned int evt_i = 0, evt_num = 0;
-	bool evt_num_known = false;
-
-restart:
-	sys_dir = tracing_events__opendir();
-	if (!sys_dir)
-		return;
-
-	if (evt_num_known) {
-		evt_list = zalloc(sizeof(char *) * evt_num);
-		if (!evt_list)
-			goto out_close_sys_dir;
-	}
-
-	for_each_subsystem(sys_dir, sys_dirent) {
-		if (subsys_glob != NULL &&
-		    !strglobmatch(sys_dirent->d_name, subsys_glob))
+	struct dirent **sys_namelist = NULL;
+	int sys_items = tracing_events__scandir_alphasort(&sys_namelist);
+
+	for (int i = 0; i < sys_items; i++) {
+		struct dirent *sys_dirent = sys_namelist[i];
+		struct dirent **evt_namelist = NULL;
+		char *dir_path;
+		int evt_items;
+
+		if (sys_dirent->d_type != DT_DIR ||
+		    !strcmp(sys_dirent->d_name, ".") ||
+		    !strcmp(sys_dirent->d_name, ".."))
 			continue;
 
 		dir_path = get_events_file(sys_dirent->d_name);
 		if (!dir_path)
 			continue;
-		evt_dir = opendir(dir_path);
-		if (!evt_dir)
-			goto next;
 
-		for_each_event(dir_path, evt_dir, evt_dirent) {
-			if (event_glob != NULL &&
-			    !strglobmatch(evt_dirent->d_name, event_glob))
+		evt_items = scandir(dir_path, &evt_namelist, NULL, alphasort);
+		for (int j = 0; j < evt_items; j++) {
+			struct dirent *evt_dirent = evt_namelist[j];
+			char evt_path[MAXPATHLEN];
+
+			if (evt_dirent->d_type != DT_DIR ||
+			    !strcmp(evt_dirent->d_name, ".") ||
+			    !strcmp(evt_dirent->d_name, ".."))
 				continue;
 
-			if (!evt_num_known) {
-				evt_num++;
+			if (tp_event_has_id(dir_path, evt_dirent) != 0)
 				continue;
-			}
 
 			snprintf(evt_path, MAXPATHLEN, "%s:%s",
 				 sys_dirent->d_name, evt_dirent->d_name);
-
-			evt_list[evt_i] = strdup(evt_path);
-			if (evt_list[evt_i] == NULL) {
-				put_events_file(dir_path);
-				goto out_close_evt_dir;
-			}
-			evt_i++;
-		}
-		closedir(evt_dir);
-next:
-		put_events_file(dir_path);
-	}
-	closedir(sys_dir);
-
-	if (!evt_num_known) {
-		evt_num_known = true;
-		goto restart;
-	}
-	qsort(evt_list, evt_num, sizeof(char *), cmp_string);
-	evt_i = 0;
-	while (evt_i < evt_num) {
-		if (name_only) {
-			printf("%s ", evt_list[evt_i++]);
-			continue;
+			print_cb->print_event(print_state,
+					/*topic=*/NULL,
+					/*pmu_name=*/NULL,
+					evt_path,
+					/*event_alias=*/NULL,
+					/*scale_unit=*/NULL,
+					/*deprecated=*/false,
+					"Tracepoint event",
+					/*desc=*/NULL,
+					/*long_desc=*/NULL,
+					/*encoding_desc=*/NULL,
+					/*metric_name=*/NULL,
+					/*metric_expr=*/NULL);
 		}
-		printf("  %-50s [%s]\n", evt_list[evt_i++],
-				event_type_descriptors[PERF_TYPE_TRACEPOINT]);
+		free(dir_path);
+		free(evt_namelist);
 	}
-	if (evt_num && pager_in_use())
-		printf("\n");
-
-out_free:
-	evt_num = evt_i;
-	for (evt_i = 0; evt_i < evt_num; evt_i++)
-		zfree(&evt_list[evt_i]);
-	zfree(&evt_list);
-	return;
-
-out_close_evt_dir:
-	closedir(evt_dir);
-out_close_sys_dir:
-	closedir(sys_dir);
-
-	printf("FATAL: not enough memory to print %s\n",
-			event_type_descriptors[PERF_TYPE_TRACEPOINT]);
-	if (evt_list)
-		goto out_free;
+	free(sys_namelist);
 }
 
-void print_sdt_events(const char *subsys_glob, const char *event_glob,
-		      bool name_only)
+void print_sdt_events(const struct print_callbacks *print_cb, void *print_state)
 {
-	struct probe_cache *pcache;
-	struct probe_cache_entry *ent;
 	struct strlist *bidlist, *sdtlist;
-	struct strlist_config cfg = {.dont_dupstr = true};
-	struct str_node *nd, *nd2;
-	char *buf, *path, *ptr = NULL;
-	bool show_detail = false;
-	int ret;
-
-	sdtlist = strlist__new(NULL, &cfg);
+	struct str_node *bid_nd, *sdt_name, *next_sdt_name;
+	const char *last_sdt_name = NULL;
+
+	/*
+	 * The implicitly sorted sdtlist will hold the tracepoint name followed
+	 * by @<buildid>. If the tracepoint name is unique (determined by
+	 * looking at the adjacent nodes) the @<buildid> is dropped otherwise
+	 * the executable path and buildid are added to the name.
+	 */
+	sdtlist = strlist__new(NULL, NULL);
 	if (!sdtlist) {
 		pr_debug("Failed to allocate new strlist for SDT\n");
 		return;
@@ -180,354 +133,274 @@ void print_sdt_events(const char *subsys_glob, const char *event_glob,
 		pr_debug("Failed to get buildids: %d\n", errno);
 		return;
 	}
-	strlist__for_each_entry(nd, bidlist) {
-		pcache = probe_cache__new(nd->s, NULL);
+	strlist__for_each_entry(bid_nd, bidlist) {
+		struct probe_cache *pcache;
+		struct probe_cache_entry *ent;
+
+		pcache = probe_cache__new(bid_nd->s, NULL);
 		if (!pcache)
 			continue;
 		list_for_each_entry(ent, &pcache->entries, node) {
-			if (!ent->sdt)
-				continue;
-			if (subsys_glob &&
-			    !strglobmatch(ent->pev.group, subsys_glob))
-				continue;
-			if (event_glob &&
-			    !strglobmatch(ent->pev.event, event_glob))
-				continue;
-			ret = asprintf(&buf, "%s:%s@%s", ent->pev.group,
-					ent->pev.event, nd->s);
-			if (ret > 0)
-				strlist__add(sdtlist, buf);
+			char buf[1024];
+
+			snprintf(buf, sizeof(buf), "%s:%s@%s",
+				 ent->pev.group, ent->pev.event, bid_nd->s);
+			strlist__add(sdtlist, buf);
 		}
 		probe_cache__delete(pcache);
 	}
 	strlist__delete(bidlist);
 
-	strlist__for_each_entry(nd, sdtlist) {
-		buf = strchr(nd->s, '@');
-		if (buf)
-			*(buf++) = '\0';
-		if (name_only) {
-			printf("%s ", nd->s);
-			continue;
-		}
-		nd2 = strlist__next(nd);
-		if (nd2) {
-			ptr = strchr(nd2->s, '@');
-			if (ptr)
-				*ptr = '\0';
-			if (strcmp(nd->s, nd2->s) == 0)
-				show_detail = true;
+	strlist__for_each_entry(sdt_name, sdtlist) {
+		bool show_detail = false;
+		char *bid = strchr(sdt_name->s, '@');
+		char *evt_name = NULL;
+
+		if (bid)
+			*(bid++) = '\0';
+
+		if (last_sdt_name && !strcmp(last_sdt_name, sdt_name->s)) {
+			show_detail = true;
+		} else {
+			next_sdt_name = strlist__next(sdt_name);
+			if (next_sdt_name) {
+				char *bid2 = strchr(next_sdt_name->s, '@');
+
+				if (bid2)
+					*bid2 = '\0';
+				if (strcmp(sdt_name->s, next_sdt_name->s) == 0)
+					show_detail = true;
+				if (bid2)
+					*bid2 = '@';
+			}
 		}
+		last_sdt_name = sdt_name->s;
+
 		if (show_detail) {
-			path = build_id_cache__origname(buf);
-			ret = asprintf(&buf, "%s@%s(%.12s)", nd->s, path, buf);
-			if (ret > 0) {
-				printf("  %-50s [%s]\n", buf, "SDT event");
-				free(buf);
+			char *path = build_id_cache__origname(bid);
+
+			if (path) {
+				if (asprintf(&evt_name, "%s@%s(%.12s)", sdt_name->s, path, bid) < 0)
+					evt_name = NULL;
+				free(path);
 			}
-			free(path);
-		} else
-			printf("  %-50s [%s]\n", nd->s, "SDT event");
-		if (nd2) {
-			if (strcmp(nd->s, nd2->s) != 0)
-				show_detail = false;
-			if (ptr)
-				*ptr = '@';
 		}
+		print_cb->print_event(print_state,
+				/*topic=*/NULL,
+				/*pmu_name=*/NULL,
+				evt_name ?: sdt_name->s,
+				/*event_alias=*/NULL,
+				/*deprecated=*/false,
+				/*scale_unit=*/NULL,
+				"SDT event",
+				/*desc=*/NULL,
+				/*long_desc=*/NULL,
+				/*encoding_desc=*/NULL,
+				/*metric_name=*/NULL,
+				/*metric_expr=*/NULL);
+
+		free(evt_name);
 	}
 	strlist__delete(sdtlist);
 }
 
-int print_hwcache_events(const char *event_glob, bool name_only)
+int print_hwcache_events(const struct print_callbacks *print_cb, void *print_state)
 {
-	unsigned int type, op, i, evt_i = 0, evt_num = 0, npmus = 0;
-	char name[64], new_name[128];
-	char **evt_list = NULL, **evt_pmus = NULL;
-	bool evt_num_known = false;
-	struct perf_pmu *pmu = NULL;
-
-	if (perf_pmu__has_hybrid()) {
-		npmus = perf_pmu__hybrid_pmu_num();
-		evt_pmus = zalloc(sizeof(char *) * npmus);
-		if (!evt_pmus)
-			goto out_enomem;
-	}
+	struct strlist *evt_name_list = strlist__new(NULL, NULL);
+	struct str_node *nd;
 
-restart:
-	if (evt_num_known) {
-		evt_list = zalloc(sizeof(char *) * evt_num);
-		if (!evt_list)
-			goto out_enomem;
+	if (!evt_name_list) {
+		pr_debug("Failed to allocate new strlist for hwcache events\n");
+		return -ENOMEM;
 	}
-
-	for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) {
-		for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) {
+	for (int type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) {
+		for (int op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) {
 			/* skip invalid cache type */
 			if (!evsel__is_cache_op_valid(type, op))
 				continue;
 
-			for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) {
-				unsigned int hybrid_supported = 0, j;
-				bool supported;
+			for (int i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) {
+				struct perf_pmu *pmu = NULL;
+				char name[64];
 
 				__evsel__hw_cache_type_op_res_name(type, op, i, name, sizeof(name));
-				if (event_glob != NULL && !strglobmatch(name, event_glob))
-					continue;
-
 				if (!perf_pmu__has_hybrid()) {
-					if (!is_event_supported(PERF_TYPE_HW_CACHE,
-								type | (op << 8) | (i << 16))) {
-						continue;
-					}
-				} else {
-					perf_pmu__for_each_hybrid_pmu(pmu) {
-						if (!evt_num_known) {
-							evt_num++;
-							continue;
-						}
-
-						supported = is_event_supported(
-							PERF_TYPE_HW_CACHE,
-							type | (op << 8) | (i << 16) |
-							((__u64)pmu->type << PERF_PMU_TYPE_SHIFT));
-						if (supported) {
-							snprintf(new_name, sizeof(new_name),
-								 "%s/%s/", pmu->name, name);
-							evt_pmus[hybrid_supported] =
-								strdup(new_name);
-							hybrid_supported++;
-						}
-					}
-
-					if (hybrid_supported == 0)
-						continue;
-				}
-
-				if (!evt_num_known) {
-					evt_num++;
+					if (is_event_supported(PERF_TYPE_HW_CACHE,
+							       type | (op << 8) | (i << 16)))
+						strlist__add(evt_name_list, name);
 					continue;
 				}
-
-				if ((hybrid_supported == 0) ||
-				    (hybrid_supported == npmus)) {
-					evt_list[evt_i] = strdup(name);
-					if (npmus > 0) {
-						for (j = 0; j < npmus; j++)
-							zfree(&evt_pmus[j]);
-					}
-				} else {
-					for (j = 0; j < hybrid_supported; j++) {
-						evt_list[evt_i++] = evt_pmus[j];
-						evt_pmus[j] = NULL;
+				perf_pmu__for_each_hybrid_pmu(pmu) {
+					if (is_event_supported(PERF_TYPE_HW_CACHE,
+					    type | (op << 8) | (i << 16) |
+					    ((__u64)pmu->type << PERF_PMU_TYPE_SHIFT))) {
+						char new_name[128];
+							snprintf(new_name, sizeof(new_name),
+								 "%s/%s/", pmu->name, name);
+							strlist__add(evt_name_list, new_name);
 					}
-					continue;
 				}
-
-				if (evt_list[evt_i] == NULL)
-					goto out_enomem;
-				evt_i++;
 			}
 		}
 	}
 
-	if (!evt_num_known) {
-		evt_num_known = true;
-		goto restart;
-	}
-
-	for (evt_i = 0; evt_i < evt_num; evt_i++) {
-		if (!evt_list[evt_i])
-			break;
+	strlist__for_each_entry(nd, evt_name_list) {
+		print_cb->print_event(print_state,
+				"cache",
+				/*pmu_name=*/NULL,
+				nd->s,
+				/*event_alias=*/NULL,
+				/*scale_unit=*/NULL,
+				/*deprecated=*/false,
+				event_type_descriptors[PERF_TYPE_HW_CACHE],
+				/*desc=*/NULL,
+				/*long_desc=*/NULL,
+				/*encoding_desc=*/NULL,
+				/*metric_name=*/NULL,
+				/*metric_expr=*/NULL);
 	}
-
-	evt_num = evt_i;
-	qsort(evt_list, evt_num, sizeof(char *), cmp_string);
-	evt_i = 0;
-	while (evt_i < evt_num) {
-		if (name_only) {
-			printf("%s ", evt_list[evt_i++]);
-			continue;
-		}
-		printf("  %-50s [%s]\n", evt_list[evt_i++],
-				event_type_descriptors[PERF_TYPE_HW_CACHE]);
-	}
-	if (evt_num && pager_in_use())
-		printf("\n");
-
-out_free:
-	evt_num = evt_i;
-	for (evt_i = 0; evt_i < evt_num; evt_i++)
-		zfree(&evt_list[evt_i]);
-	zfree(&evt_list);
-
-	for (evt_i = 0; evt_i < npmus; evt_i++)
-		zfree(&evt_pmus[evt_i]);
-	zfree(&evt_pmus);
-	return evt_num;
-
-out_enomem:
-	printf("FATAL: not enough memory to print %s\n",
-		event_type_descriptors[PERF_TYPE_HW_CACHE]);
-	if (evt_list)
-		goto out_free;
-	return evt_num;
+	strlist__delete(evt_name_list);
+	return 0;
 }
 
-static void print_tool_event(const struct event_symbol *syms, const char *event_glob,
-			     bool name_only)
+void print_tool_events(const struct print_callbacks *print_cb, void *print_state)
 {
-	if (syms->symbol == NULL)
-		return;
-
-	if (event_glob && !(strglobmatch(syms->symbol, event_glob) ||
-	      (syms->alias && strglobmatch(syms->alias, event_glob))))
-		return;
-
-	if (name_only)
-		printf("%s ", syms->symbol);
-	else {
-		char name[MAX_NAME_LEN];
-
-		if (syms->alias && strlen(syms->alias))
-			snprintf(name, MAX_NAME_LEN, "%s OR %s", syms->symbol, syms->alias);
-		else
-			strlcpy(name, syms->symbol, MAX_NAME_LEN);
-		printf("  %-50s [%s]\n", name, "Tool event");
+	// Start at 1 because the first enum entry means no tool event.
+	for (int i = 1; i < PERF_TOOL_MAX; ++i) {
+		print_cb->print_event(print_state,
+				"tool",
+				/*pmu_name=*/NULL,
+				event_symbols_tool[i].symbol,
+				event_symbols_tool[i].alias,
+				/*scale_unit=*/NULL,
+				/*deprecated=*/false,
+				"Tool event",
+				/*desc=*/NULL,
+				/*long_desc=*/NULL,
+				/*encoding_desc=*/NULL,
+				/*metric_name=*/NULL,
+				/*metric_expr=*/NULL);
 	}
 }
 
-void print_tool_events(const char *event_glob, bool name_only)
+void print_symbol_events(const struct print_callbacks *print_cb, void *print_state,
+			 unsigned int type, const struct event_symbol *syms,
+			 unsigned int max)
 {
-	// Start at 1 because the first enum entry means no tool event.
-	for (int i = 1; i < PERF_TOOL_MAX; ++i)
-		print_tool_event(event_symbols_tool + i, event_glob, name_only);
-
-	if (pager_in_use())
-		printf("\n");
-}
+	struct strlist *evt_name_list = strlist__new(NULL, NULL);
+	struct str_node *nd;
 
-void print_symbol_events(const char *event_glob, unsigned int type,
-			 struct event_symbol *syms, unsigned int max,
-			 bool name_only)
-{
-	unsigned int i, evt_i = 0, evt_num = 0;
-	char name[MAX_NAME_LEN];
-	char **evt_list = NULL;
-	bool evt_num_known = false;
-
-restart:
-	if (evt_num_known) {
-		evt_list = zalloc(sizeof(char *) * evt_num);
-		if (!evt_list)
-			goto out_enomem;
-		syms -= max;
+	if (!evt_name_list) {
+		pr_debug("Failed to allocate new strlist for symbol events\n");
+		return;
 	}
-
-	for (i = 0; i < max; i++, syms++) {
+	for (unsigned int i = 0; i < max; i++) {
 		/*
 		 * New attr.config still not supported here, the latest
 		 * example was PERF_COUNT_SW_CGROUP_SWITCHES
 		 */
-		if (syms->symbol == NULL)
-			continue;
-
-		if (event_glob != NULL && !(strglobmatch(syms->symbol, event_glob) ||
-		      (syms->alias && strglobmatch(syms->alias, event_glob))))
+		if (syms[i].symbol == NULL)
 			continue;
 
 		if (!is_event_supported(type, i))
 			continue;
 
-		if (!evt_num_known) {
-			evt_num++;
-			continue;
-		}
-
-		if (!name_only && strlen(syms->alias))
-			snprintf(name, MAX_NAME_LEN, "%s OR %s", syms->symbol, syms->alias);
-		else
-			strlcpy(name, syms->symbol, MAX_NAME_LEN);
+		if (strlen(syms[i].alias)) {
+			char name[MAX_NAME_LEN];
 
-		evt_list[evt_i] = strdup(name);
-		if (evt_list[evt_i] == NULL)
-			goto out_enomem;
-		evt_i++;
+			snprintf(name, MAX_NAME_LEN, "%s OR %s", syms[i].symbol, syms[i].alias);
+			strlist__add(evt_name_list, name);
+		} else
+			strlist__add(evt_name_list, syms[i].symbol);
 	}
 
-	if (!evt_num_known) {
-		evt_num_known = true;
-		goto restart;
-	}
-	qsort(evt_list, evt_num, sizeof(char *), cmp_string);
-	evt_i = 0;
-	while (evt_i < evt_num) {
-		if (name_only) {
-			printf("%s ", evt_list[evt_i++]);
-			continue;
+	strlist__for_each_entry(nd, evt_name_list) {
+		char *alias = strstr(nd->s, " OR ");
+
+		if (alias) {
+			*alias = '\0';
+			alias += 4;
 		}
-		printf("  %-50s [%s]\n", evt_list[evt_i++], event_type_descriptors[type]);
+		print_cb->print_event(print_state,
+				/*topic=*/NULL,
+				/*pmu_name=*/NULL,
+				nd->s,
+				alias,
+				/*scale_unit=*/NULL,
+				/*deprecated=*/false,
+				event_type_descriptors[type],
+				/*desc=*/NULL,
+				/*long_desc=*/NULL,
+				/*encoding_desc=*/NULL,
+				/*metric_name=*/NULL,
+				/*metric_expr=*/NULL);
 	}
-	if (evt_num && pager_in_use())
-		printf("\n");
-
-out_free:
-	evt_num = evt_i;
-	for (evt_i = 0; evt_i < evt_num; evt_i++)
-		zfree(&evt_list[evt_i]);
-	zfree(&evt_list);
-	return;
-
-out_enomem:
-	printf("FATAL: not enough memory to print %s\n", event_type_descriptors[type]);
-	if (evt_list)
-		goto out_free;
+	strlist__delete(evt_name_list);
 }
 
 /*
  * Print the help text for the event symbols:
  */
-void print_events(const char *event_glob, bool name_only, bool quiet_flag,
-			bool long_desc, bool details_flag, bool deprecated,
-			const char *pmu_name)
+void print_events(const struct print_callbacks *print_cb, void *print_state)
 {
-	print_symbol_events(event_glob, PERF_TYPE_HARDWARE,
-			    event_symbols_hw, PERF_COUNT_HW_MAX, name_only);
-
-	print_symbol_events(event_glob, PERF_TYPE_SOFTWARE,
-			    event_symbols_sw, PERF_COUNT_SW_MAX, name_only);
-	print_tool_events(event_glob, name_only);
-
-	print_hwcache_events(event_glob, name_only);
-
-	print_pmu_events(event_glob, name_only, quiet_flag, long_desc,
-			details_flag, deprecated, pmu_name);
-
-	if (event_glob != NULL)
-		return;
-
-	if (!name_only) {
-		printf("  %-50s [%s]\n",
-		       "rNNN",
-		       event_type_descriptors[PERF_TYPE_RAW]);
-		printf("  %-50s [%s]\n",
-		       "cpu/t1=v1[,t2=v2,t3 ...]/modifier",
-		       event_type_descriptors[PERF_TYPE_RAW]);
-		if (pager_in_use())
-			printf("   (see 'man perf-list' on how to encode it)\n\n");
-
-		printf("  %-50s [%s]\n",
-		       "mem:<addr>[/len][:access]",
-			event_type_descriptors[PERF_TYPE_BREAKPOINT]);
-		if (pager_in_use())
-			printf("\n");
-	}
-
-	print_tracepoint_events(NULL, NULL, name_only);
-
-	print_sdt_events(NULL, NULL, name_only);
-
-	metricgroup__print(true, true, NULL, name_only, details_flag,
-			   pmu_name);
-
-	print_libpfm_events(name_only, long_desc);
+	print_symbol_events(print_cb, print_state, PERF_TYPE_HARDWARE,
+			event_symbols_hw, PERF_COUNT_HW_MAX);
+	print_symbol_events(print_cb, print_state, PERF_TYPE_SOFTWARE,
+			event_symbols_sw, PERF_COUNT_SW_MAX);
+
+	print_tool_events(print_cb, print_state);
+
+	print_hwcache_events(print_cb, print_state);
+
+	print_pmu_events(print_cb, print_state);
+
+	print_cb->print_event(print_state,
+			/*topic=*/NULL,
+			/*pmu_name=*/NULL,
+			"rNNN",
+			/*event_alias=*/NULL,
+			/*scale_unit=*/NULL,
+			/*deprecated=*/false,
+			event_type_descriptors[PERF_TYPE_RAW],
+			/*desc=*/NULL,
+			/*long_desc=*/NULL,
+			/*encoding_desc=*/NULL,
+			/*metric_name=*/NULL,
+			/*metric_expr=*/NULL);
+
+	print_cb->print_event(print_state,
+			/*topic=*/NULL,
+			/*pmu_name=*/NULL,
+			"cpu/t1=v1[,t2=v2,t3 ...]/modifier",
+			/*event_alias=*/NULL,
+			/*scale_unit=*/NULL,
+			/*deprecated=*/false,
+			event_type_descriptors[PERF_TYPE_RAW],
+			"(see 'man perf-list' on how to encode it)",
+			/*long_desc=*/NULL,
+			/*encoding_desc=*/NULL,
+			/*metric_name=*/NULL,
+			/*metric_expr=*/NULL);
+
+	print_cb->print_event(print_state,
+			/*topic=*/NULL,
+			/*pmu_name=*/NULL,
+			"mem:<addr>[/len][:access]",
+			/*scale_unit=*/NULL,
+			/*event_alias=*/NULL,
+			/*deprecated=*/false,
+			event_type_descriptors[PERF_TYPE_BREAKPOINT],
+			/*desc=*/NULL,
+			/*long_desc=*/NULL,
+			/*encoding_desc=*/NULL,
+			/*metric_name=*/NULL,
+			/*metric_expr=*/NULL);
+
+	print_tracepoint_events(print_cb, print_state);
+
+	print_sdt_events(print_cb, print_state);
+
+	metricgroup__print(print_cb, print_state);
+
+	print_libpfm_events(print_cb, print_state);
 }
diff --git a/tools/perf/util/print-events.h b/tools/perf/util/print-events.h
index 1da9910d83a6..c237e53c4487 100644
--- a/tools/perf/util/print-events.h
+++ b/tools/perf/util/print-events.h
@@ -2,21 +2,39 @@
 #ifndef __PERF_PRINT_EVENTS_H
 #define __PERF_PRINT_EVENTS_H
 
+#include <linux/perf_event.h>
 #include <stdbool.h>
 
 struct event_symbol;
 
-void print_events(const char *event_glob, bool name_only, bool quiet_flag,
-		  bool long_desc, bool details_flag, bool deprecated,
-		  const char *pmu_name);
-int print_hwcache_events(const char *event_glob, bool name_only);
-void print_sdt_events(const char *subsys_glob, const char *event_glob,
-		      bool name_only);
-void print_symbol_events(const char *event_glob, unsigned int type,
-			 struct event_symbol *syms, unsigned int max,
-			 bool name_only);
-void print_tool_events(const char *event_glob, bool name_only);
-void print_tracepoint_events(const char *subsys_glob, const char *event_glob,
-			     bool name_only);
+struct print_callbacks {
+	void (*print_start)(void *print_state);
+	void (*print_end)(void *print_state);
+	void (*print_event)(void *print_state, const char *topic,
+			const char *pmu_name,
+			const char *event_name, const char *event_alias,
+			const char *scale_unit,
+			bool deprecated, const char *event_type_desc,
+			const char *desc, const char *long_desc,
+			const char *encoding_desc,
+			const char *metric_name, const char *metric_expr);
+	void (*print_metric)(void *print_state,
+			const char *group,
+			const char *name,
+			const char *desc,
+			const char *long_desc,
+			const char *expr,
+			const char *unit);
+};
+
+/** Print all events, the default when no options are specified. */
+void print_events(const struct print_callbacks *print_cb, void *print_state);
+int print_hwcache_events(const struct print_callbacks *print_cb, void *print_state);
+void print_sdt_events(const struct print_callbacks *print_cb, void *print_state);
+void print_symbol_events(const struct print_callbacks *print_cb, void *print_state,
+			 unsigned int type, const struct event_symbol *syms,
+			 unsigned int max);
+void print_tool_events(const struct print_callbacks *print_cb, void *print_state);
+void print_tracepoint_events(const struct print_callbacks *print_cb, void *print_state);
 
 #endif /* __PERF_PRINT_EVENTS_H */
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 50d861a80f57..54b49ce85c9f 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -763,7 +763,7 @@ static int find_best_scope_cb(Dwarf_Die *fn_die, void *data)
 
 	/* Skip if declared file name does not match */
 	if (fsp->file) {
-		file = dwarf_decl_file(fn_die);
+		file = die_get_decl_file(fn_die);
 		if (!file || strcmp(fsp->file, file) != 0)
 			return 0;
 	}
@@ -1063,6 +1063,7 @@ static int probe_point_search_cb(Dwarf_Die *sp_die, void *data)
 	struct dwarf_callback_param *param = data;
 	struct probe_finder *pf = param->data;
 	struct perf_probe_point *pp = &pf->pev->point;
+	const char *fname;
 
 	/* Check tag and diename */
 	if (!die_is_func_def(sp_die) ||
@@ -1070,12 +1071,17 @@ static int probe_point_search_cb(Dwarf_Die *sp_die, void *data)
 		return DWARF_CB_OK;
 
 	/* Check declared file */
-	if (pp->file && strtailcmp(pp->file, dwarf_decl_file(sp_die)))
+	fname = die_get_decl_file(sp_die);
+	if (!fname) {
+		pr_warning("A function DIE doesn't have decl_line. Maybe broken DWARF?\n");
+		return DWARF_CB_OK;
+	}
+	if (pp->file && fname && strtailcmp(pp->file, fname))
 		return DWARF_CB_OK;
 
 	pr_debug("Matched function: %s [%lx]\n", dwarf_diename(sp_die),
 		 (unsigned long)dwarf_dieoffset(sp_die));
-	pf->fname = dwarf_decl_file(sp_die);
+	pf->fname = fname;
 	if (pp->line) { /* Function relative line */
 		dwarf_decl_line(sp_die, &pf->lno);
 		pf->lno += pp->line;
@@ -1134,6 +1140,7 @@ struct pubname_callback_param {
 static int pubname_search_cb(Dwarf *dbg, Dwarf_Global *gl, void *data)
 {
 	struct pubname_callback_param *param = data;
+	const char *fname;
 
 	if (dwarf_offdie(dbg, gl->die_offset, param->sp_die)) {
 		if (dwarf_tag(param->sp_die) != DW_TAG_subprogram)
@@ -1143,9 +1150,11 @@ static int pubname_search_cb(Dwarf *dbg, Dwarf_Global *gl, void *data)
 			if (!dwarf_offdie(dbg, gl->cu_offset, param->cu_die))
 				return DWARF_CB_OK;
 
-			if (param->file &&
-			    strtailcmp(param->file, dwarf_decl_file(param->sp_die)))
-				return DWARF_CB_OK;
+			if (param->file) {
+				fname = die_get_decl_file(param->sp_die);
+				if (!fname || strtailcmp(param->file, fname))
+					return DWARF_CB_OK;
+			}
 
 			param->found = 1;
 			return DWARF_CB_ABORT;
@@ -1741,7 +1750,7 @@ int debuginfo__find_probe_point(struct debuginfo *dbg, u64 addr,
 			goto post;
 		}
 
-		fname = dwarf_decl_file(&spdie);
+		fname = die_get_decl_file(&spdie);
 		if (addr == baseaddr) {
 			/* Function entry - Relative line number is 0 */
 			lineno = baseline;
@@ -1778,8 +1787,8 @@ int debuginfo__find_probe_point(struct debuginfo *dbg, u64 addr,
 			}
 		}
 		/* Verify the lineno and baseline are in a same file */
-		tmp = dwarf_decl_file(&spdie);
-		if (!tmp || strcmp(tmp, fname) != 0)
+		tmp = die_get_decl_file(&spdie);
+		if (!tmp || (fname && strcmp(tmp, fname) != 0))
 			lineno = 0;
 	}
 
@@ -1889,13 +1898,17 @@ static int line_range_search_cb(Dwarf_Die *sp_die, void *data)
 	struct dwarf_callback_param *param = data;
 	struct line_finder *lf = param->data;
 	struct line_range *lr = lf->lr;
+	const char *fname;
 
 	/* Check declared file */
-	if (lr->file && strtailcmp(lr->file, dwarf_decl_file(sp_die)))
-		return DWARF_CB_OK;
+	if (lr->file) {
+		fname = die_get_decl_file(sp_die);
+		if (!fname || strtailcmp(lr->file, fname))
+			return DWARF_CB_OK;
+	}
 
 	if (die_match_name(sp_die, lr->function) && die_is_func_def(sp_die)) {
-		lf->fname = dwarf_decl_file(sp_die);
+		lf->fname = die_get_decl_file(sp_die);
 		dwarf_decl_line(sp_die, &lr->offset);
 		pr_debug("fname: %s, lineno:%d\n", lf->fname, lr->offset);
 		lf->lno_s = lr->offset + lr->start;
diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index 5be5fa2391de..212031b97910 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -5,7 +5,9 @@
 #include <poll.h>
 #include <linux/err.h>
 #include <perf/cpumap.h>
+#ifdef HAVE_LIBTRACEEVENT
 #include <traceevent/event-parse.h>
+#endif
 #include <perf/mmap.h>
 #include "evlist.h"
 #include "callchain.h"
@@ -417,6 +419,7 @@ static PyObject *pyrf_sample_event__repr(struct pyrf_event *pevent)
 	return ret;
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 static bool is_tracepoint(struct pyrf_event *pevent)
 {
 	return pevent->evsel->core.attr.type == PERF_TYPE_TRACEPOINT;
@@ -439,8 +442,10 @@ tracepoint_field(struct pyrf_event *pe, struct tep_format_field *field)
 			offset  = val;
 			len     = offset >> 16;
 			offset &= 0xffff;
+#ifdef HAVE_LIBTRACEEVENT_TEP_FIELD_IS_RELATIVE
 			if (field->flags & TEP_FIELD_IS_RELATIVE)
 				offset += field->offset + field->size;
+#endif
 		}
 		if (field->flags & TEP_FIELD_IS_STRING &&
 		    is_printable_array(data + offset, len)) {
@@ -486,14 +491,17 @@ get_tracepoint_field(struct pyrf_event *pevent, PyObject *attr_name)
 
 	return tracepoint_field(pevent, field);
 }
+#endif /* HAVE_LIBTRACEEVENT */
 
 static PyObject*
 pyrf_sample_event__getattro(struct pyrf_event *pevent, PyObject *attr_name)
 {
 	PyObject *obj = NULL;
 
+#ifdef HAVE_LIBTRACEEVENT
 	if (is_tracepoint(pevent))
 		obj = get_tracepoint_field(pevent, attr_name);
+#endif
 
 	return obj ?: PyObject_GenericGetAttr((PyObject *) pevent, attr_name);
 }
@@ -718,17 +726,17 @@ static Py_ssize_t pyrf_thread_map__length(PyObject *obj)
 {
 	struct pyrf_thread_map *pthreads = (void *)obj;
 
-	return pthreads->threads->nr;
+	return perf_thread_map__nr(pthreads->threads);
 }
 
 static PyObject *pyrf_thread_map__item(PyObject *obj, Py_ssize_t i)
 {
 	struct pyrf_thread_map *pthreads = (void *)obj;
 
-	if (i >= pthreads->threads->nr)
+	if (i >= perf_thread_map__nr(pthreads->threads))
 		return NULL;
 
-	return Py_BuildValue("i", pthreads->threads->map[i]);
+	return Py_BuildValue("i", perf_thread_map__pid(pthreads->threads, i));
 }
 
 static PySequenceMethods pyrf_thread_map__sequence_methods = {
@@ -1134,14 +1142,6 @@ static PyObject *pyrf_evlist__open(struct pyrf_evlist *pevlist,
 				   PyObject *args, PyObject *kwargs)
 {
 	struct evlist *evlist = &pevlist->evlist;
-	int group = 0;
-	static char *kwlist[] = { "group", NULL };
-
-	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|OOii", kwlist, &group))
-		return NULL;
-
-	if (group)
-		evlist__set_leader(evlist);
 
 	if (evlist__open(evlist) < 0) {
 		PyErr_SetFromErrno(PyExc_OSError);
@@ -1326,6 +1326,9 @@ static struct {
 static PyObject *pyrf__tracepoint(struct pyrf_evsel *pevsel,
 				  PyObject *args, PyObject *kwargs)
 {
+#ifndef HAVE_LIBTRACEEVENT
+	return NULL;
+#else
 	struct tep_event *tp_format;
 	static char *kwlist[] = { "sys", "name", NULL };
 	char *sys  = NULL;
@@ -1340,6 +1343,7 @@ static PyObject *pyrf__tracepoint(struct pyrf_evsel *pevsel,
 		return _PyLong_FromLong(-1);
 
 	return _PyLong_FromLong(tp_format->id);
+#endif // HAVE_LIBTRACEEVENT
 }
 
 static PyMethodDef perf__methods[] = {
diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c
index 7b58f6c7c69d..9eb5c6a08999 100644
--- a/tools/perf/util/record.c
+++ b/tools/perf/util/record.c
@@ -99,13 +99,6 @@ void evlist__config(struct evlist *evlist, struct record_opts *opts, struct call
 	bool use_comm_exec;
 	bool sample_id = opts->sample_id;
 
-	/*
-	 * Set the evsel leader links before we configure attributes,
-	 * since some might depend on this info.
-	 */
-	if (opts->group)
-		evlist__set_leader(evlist);
-
 	if (perf_cpu_map__cpu(evlist->core.user_requested_cpus, 0).cpu < 0)
 		opts->no_inherit = true;
 
diff --git a/tools/perf/util/record.h b/tools/perf/util/record.h
index 4269e916f450..46212bf020cf 100644
--- a/tools/perf/util/record.h
+++ b/tools/perf/util/record.h
@@ -13,7 +13,6 @@ struct option;
 
 struct record_opts {
 	struct target target;
-	bool	      group;
 	bool	      inherit_stat;
 	bool	      no_buffering;
 	bool	      no_inherit;
diff --git a/tools/perf/util/s390-cpumsf.c b/tools/perf/util/s390-cpumsf.c
index f3fdad28a852..6fe478b0b61b 100644
--- a/tools/perf/util/s390-cpumsf.c
+++ b/tools/perf/util/s390-cpumsf.c
@@ -163,6 +163,7 @@
 #include "s390-cpumsf-kernel.h"
 #include "s390-cpumcf-kernel.h"
 #include "config.h"
+#include "util/sample.h"
 
 struct s390_cpumsf {
 	struct auxtrace		auxtrace;
diff --git a/tools/perf/util/s390-sample-raw.c b/tools/perf/util/s390-sample-raw.c
index 9a631d97471c..c10b891dbad6 100644
--- a/tools/perf/util/s390-sample-raw.c
+++ b/tools/perf/util/s390-sample-raw.c
@@ -28,6 +28,7 @@
 #include "sample-raw.h"
 #include "s390-cpumcf-kernel.h"
 #include "pmu-events/pmu-events.h"
+#include "util/sample.h"
 
 static size_t ctrset_size(struct cf_ctrset_entry *set)
 {
diff --git a/tools/perf/util/sample.h b/tools/perf/util/sample.h
new file mode 100644
index 000000000000..60ec79d4eea4
--- /dev/null
+++ b/tools/perf/util/sample.h
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PERF_SAMPLE_H
+#define __PERF_SAMPLE_H
+
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+/* number of register is bound by the number of bits in regs_dump::mask (64) */
+#define PERF_SAMPLE_REGS_CACHE_SIZE (8 * sizeof(u64))
+
+struct regs_dump {
+	u64 abi;
+	u64 mask;
+	u64 *regs;
+
+	/* Cached values/mask filled by first register access. */
+	u64 cache_regs[PERF_SAMPLE_REGS_CACHE_SIZE];
+	u64 cache_mask;
+};
+
+struct stack_dump {
+	u16 offset;
+	u64 size;
+	char *data;
+};
+
+struct sample_read_value {
+	u64 value;
+	u64 id;   /* only if PERF_FORMAT_ID */
+	u64 lost; /* only if PERF_FORMAT_LOST */
+};
+
+struct sample_read {
+	u64 time_enabled;
+	u64 time_running;
+	union {
+		struct {
+			u64 nr;
+			struct sample_read_value *values;
+		} group;
+		struct sample_read_value one;
+	};
+};
+
+static inline size_t sample_read_value_size(u64 read_format)
+{
+	/* PERF_FORMAT_ID is forced for PERF_SAMPLE_READ */
+	if (read_format & PERF_FORMAT_LOST)
+		return sizeof(struct sample_read_value);
+	else
+		return offsetof(struct sample_read_value, lost);
+}
+
+static inline struct sample_read_value *next_sample_read_value(struct sample_read_value *v, u64 read_format)
+{
+	return (void *)v + sample_read_value_size(read_format);
+}
+
+#define sample_read_group__for_each(v, nr, rf) \
+	for (int __i = 0; __i < (int)nr; v = next_sample_read_value(v, rf), __i++)
+
+#define MAX_INSN 16
+
+struct aux_sample {
+	u64 size;
+	void *data;
+};
+
+struct perf_sample {
+	u64 ip;
+	u32 pid, tid;
+	u64 time;
+	u64 addr;
+	u64 id;
+	u64 stream_id;
+	u64 period;
+	u64 weight;
+	u64 transaction;
+	u64 insn_cnt;
+	u64 cyc_cnt;
+	u32 cpu;
+	u32 raw_size;
+	u64 data_src;
+	u64 phys_addr;
+	u64 data_page_size;
+	u64 code_page_size;
+	u64 cgroup;
+	u32 flags;
+	u32 machine_pid;
+	u32 vcpu;
+	u16 insn_len;
+	u8  cpumode;
+	u16 misc;
+	u16 ins_lat;
+	u16 p_stage_cyc;
+	bool no_hw_idx;		/* No hw_idx collected in branch_stack */
+	char insn[MAX_INSN];
+	void *raw_data;
+	struct ip_callchain *callchain;
+	struct branch_stack *branch_stack;
+	struct regs_dump  user_regs;
+	struct regs_dump  intr_regs;
+	struct stack_dump user_stack;
+	struct sample_read read;
+	struct aux_sample aux_sample;
+};
+
+/*
+ * raw_data is always 4 bytes from an 8-byte boundary, so subtract 4 to get
+ * 8-byte alignment.
+ */
+static inline void *perf_sample__synth_ptr(struct perf_sample *sample)
+{
+	return sample->raw_data - 4;
+}
+
+#endif /* __PERF_SAMPLE_H */
diff --git a/tools/perf/util/scripting-engines/Build b/tools/perf/util/scripting-engines/Build
index 0f5ba28339cf..d47820c0b4d4 100644
--- a/tools/perf/util/scripting-engines/Build
+++ b/tools/perf/util/scripting-engines/Build
@@ -1,5 +1,7 @@
-perf-$(CONFIG_LIBPERL)   += trace-event-perl.o
-perf-$(CONFIG_LIBPYTHON) += trace-event-python.o
+ifeq ($(CONFIG_LIBTRACEEVENT),y)
+  perf-$(CONFIG_LIBPERL)   += trace-event-perl.o
+  perf-$(CONFIG_LIBPYTHON) += trace-event-python.o
+endif
 
 CFLAGS_trace-event-perl.o += $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow -Wno-nested-externs -Wno-undef -Wno-switch-default -Wno-bad-function-cast -Wno-declaration-after-statement -Wno-switch-enum
 
diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c
index a5d945415bbc..c097b7934fd4 100644
--- a/tools/perf/util/scripting-engines/trace-event-perl.c
+++ b/tools/perf/util/scripting-engines/trace-event-perl.c
@@ -27,6 +27,7 @@
 #include <errno.h>
 #include <linux/bitmap.h>
 #include <linux/time64.h>
+#include <traceevent/event-parse.h>
 
 #include <stdbool.h>
 /* perl needs the following define, right after including stdbool.h */
@@ -365,7 +366,7 @@ static void perl_process_tracepoint(struct perf_sample *sample,
 
 	sprintf(handler, "%s::%s", event->system, event->name);
 
-	if (!test_and_set_bit(event->id, events_defined))
+	if (!__test_and_set_bit(event->id, events_defined))
 		define_event_symbols(event, handler, event->print_fmt.args);
 
 	s = nsecs / NSEC_PER_SEC;
@@ -392,8 +393,10 @@ static void perl_process_tracepoint(struct perf_sample *sample,
 			if (field->flags & TEP_FIELD_IS_DYNAMIC) {
 				offset = *(int *)(data + field->offset);
 				offset &= 0xffff;
+#ifdef HAVE_LIBTRACEEVENT_TEP_FIELD_IS_RELATIVE
 				if (field->flags & TEP_FIELD_IS_RELATIVE)
 					offset += field->offset + field->size;
+#endif
 			} else
 				offset = field->offset;
 			XPUSHs(sv_2mortal(newSVpv((char *)data + offset, 0)));
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 1f2040f36d4e..e930f5f1f36d 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -30,6 +30,7 @@
 #include <linux/bitmap.h>
 #include <linux/compiler.h>
 #include <linux/time64.h>
+#include <traceevent/event-parse.h>
 
 #include "../build-id.h"
 #include "../counts.h"
@@ -52,6 +53,7 @@
 #include "print_binary.h"
 #include "stat.h"
 #include "mem-events.h"
+#include "util/perf_regs.h"
 
 #if PY_MAJOR_VERSION < 3
 #define _PyUnicode_FromString(arg) \
@@ -933,7 +935,7 @@ static void python_process_tracepoint(struct perf_sample *sample,
 
 	sprintf(handler_name, "%s__%s", event->system, event->name);
 
-	if (!test_and_set_bit(event->id, events_defined))
+	if (!__test_and_set_bit(event->id, events_defined))
 		define_event_symbols(event, handler_name, event->print_fmt.args);
 
 	handler = get_handler(handler_name);
@@ -992,8 +994,10 @@ static void python_process_tracepoint(struct perf_sample *sample,
 				offset  = val;
 				len     = offset >> 16;
 				offset &= 0xffff;
+#ifdef HAVE_LIBTRACEEVENT_TEP_FIELD_IS_RELATIVE
 				if (field->flags & TEP_FIELD_IS_RELATIVE)
 					offset += field->offset + field->size;
+#endif
 			}
 			if (field->flags & TEP_FIELD_IS_STRING &&
 			    is_printable_array(data + offset, len)) {
@@ -1653,13 +1657,7 @@ static void python_process_stat(struct perf_stat_config *config,
 	struct perf_cpu_map *cpus = counter->core.cpus;
 	int cpu, thread;
 
-	if (config->aggr_mode == AGGR_GLOBAL) {
-		process_stat(counter, (struct perf_cpu){ .cpu = -1 }, -1, tstamp,
-			     &counter->counts->aggr);
-		return;
-	}
-
-	for (thread = 0; thread < threads->nr; thread++) {
+	for (thread = 0; thread < perf_thread_map__nr(threads); thread++) {
 		for (cpu = 0; cpu < perf_cpu_map__nr(cpus); cpu++) {
 			process_stat(counter, perf_cpu_map__cpu(cpus, cpu),
 				     perf_thread_map__pid(threads, thread), tstamp,
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 1a4f10de29ff..7c021c6cedb9 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <errno.h>
+#include <signal.h>
 #include <inttypes.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
@@ -313,7 +314,9 @@ void perf_session__delete(struct perf_session *session)
 			evlist__delete(session->evlist);
 		perf_data__close(session->data);
 	}
+#ifdef HAVE_LIBTRACEEVENT
 	trace_event__cleanup(&session->tevent);
+#endif
 	free(session);
 }
 
@@ -2022,7 +2025,7 @@ static int perf_session__flush_thread_stacks(struct perf_session *session)
 					 NULL);
 }
 
-volatile int session_done;
+volatile sig_atomic_t session_done;
 
 static int __perf_session__process_decomp_events(struct perf_session *session);
 
@@ -2748,7 +2751,7 @@ int perf_session__cpu_bitmap(struct perf_session *session,
 			goto out_delete_map;
 		}
 
-		set_bit(cpu.cpu, cpu_bitmap);
+		__set_bit(cpu.cpu, cpu_bitmap);
 	}
 
 	err = 0;
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index be5871ea558f..ee3715e8563b 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -33,7 +33,9 @@ struct perf_session {
 	struct auxtrace		*auxtrace;
 	struct itrace_synth_opts *itrace_synth_opts;
 	struct list_head	auxtrace_index;
+#ifdef HAVE_LIBTRACEEVENT
 	struct trace_event	tevent;
+#endif
 	struct perf_record_time_conv	time_conv;
 	bool			repipe;
 	bool			one_mmap;
diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py
index 5b1e6468d5e8..4f265d0222c4 100644
--- a/tools/perf/util/setup.py
+++ b/tools/perf/util/setup.py
@@ -7,7 +7,7 @@ cc_is_clang = b"clang version" in Popen([cc.split()[0], "-v"], stderr=PIPE).stde
 src_feature_tests  = getenv('srctree') + '/tools/build/feature'
 
 def clang_has_option(option):
-    cc_output = Popen([cc, option, path.join(src_feature_tests, "test-hello.c") ], stderr=PIPE).stderr.readlines()
+    cc_output = Popen([cc.split()[0], str(cc.split()[1:]) + option, path.join(src_feature_tests, "test-hello.c") ], stderr=PIPE).stderr.readlines()
     return [o for o in cc_output if ((b"unknown argument" in o) or (b"is not supported" in o))] == [ ]
 
 if cc_is_clang:
@@ -63,12 +63,18 @@ libperf = getenv('LIBPERF')
 ext_sources = [f.strip() for f in open('util/python-ext-sources')
 				if len(f.strip()) > 0 and f[0] != '#']
 
+extra_libraries = []
+
+if '-DHAVE_LIBTRACEEVENT' in cflags:
+    extra_libraries += [ 'traceevent' ]
+else:
+    ext_sources.remove('util/trace-event.c')
+
 # use full paths with source files
 ext_sources = list(map(lambda x: '%s/%s' % (src_perf, x) , ext_sources))
 
-extra_libraries = []
 if '-DHAVE_LIBNUMA_SUPPORT' in cflags:
-    extra_libraries = [ 'numa' ]
+    extra_libraries += [ 'numa' ]
 if '-DHAVE_LIBCAP_SUPPORT' in cflags:
     extra_libraries += [ 'cap' ]
 
@@ -77,7 +83,8 @@ perf = Extension('perf',
 		  include_dirs = ['util/include'],
 		  libraries = extra_libraries,
 		  extra_compile_args = cflags,
-		  extra_objects = [libtraceevent, libapikfs, libperf],
+		  extra_objects = [ x for x in [libtraceevent, libapikfs, libperf]
+                                    if x is not None],
                  )
 
 setup(name='perf',
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 2e7330867e2e..0ecc2cb13792 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -22,7 +22,6 @@
 #include "srcline.h"
 #include "strlist.h"
 #include "strbuf.h"
-#include <traceevent/event-parse.h>
 #include "mem-events.h"
 #include "annotate.h"
 #include "event.h"
@@ -32,6 +31,10 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 
+#ifdef HAVE_LIBTRACEEVENT
+#include <traceevent/event-parse.h>
+#endif
+
 regex_t		parent_regex;
 const char	default_parent_pattern[] = "^sys_|^do_page_fault";
 const char	*parent_pattern = default_parent_pattern;
@@ -743,6 +746,7 @@ struct sort_entry sort_time = {
 
 /* --sort trace */
 
+#ifdef HAVE_LIBTRACEEVENT
 static char *get_trace_output(struct hist_entry *he)
 {
 	struct trace_seq seq;
@@ -806,6 +810,7 @@ struct sort_entry sort_trace = {
 	.se_snprintf    = hist_entry__trace_snprintf,
 	.se_width_idx	= HISTC_TRACE,
 };
+#endif /* HAVE_LIBTRACEEVENT */
 
 /* sort keys for branch stacks */
 
@@ -2022,7 +2027,9 @@ static struct sort_dimension common_sort_dimensions[] = {
 	DIM(SORT_LOCAL_WEIGHT, "local_weight", sort_local_weight),
 	DIM(SORT_GLOBAL_WEIGHT, "weight", sort_global_weight),
 	DIM(SORT_TRANSACTION, "transaction", sort_transaction),
+#ifdef HAVE_LIBTRACEEVENT
 	DIM(SORT_TRACE, "trace", sort_trace),
+#endif
 	DIM(SORT_SYM_SIZE, "symbol_size", sort_sym_size),
 	DIM(SORT_DSO_SIZE, "dso_size", sort_dso_size),
 	DIM(SORT_CGROUP, "cgroup", sort_cgroup),
@@ -2206,7 +2213,14 @@ bool perf_hpp__is_ ## key ## _entry(struct perf_hpp_fmt *fmt)	\
 	return hse->se == &sort_ ## key ;			\
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 MK_SORT_ENTRY_CHK(trace)
+#else
+bool perf_hpp__is_trace_entry(struct perf_hpp_fmt *fmt __maybe_unused)
+{
+	return false;
+}
+#endif
 MK_SORT_ENTRY_CHK(srcline)
 MK_SORT_ENTRY_CHK(srcfile)
 MK_SORT_ENTRY_CHK(thread)
@@ -2347,6 +2361,17 @@ static int __sort_dimension__add_hpp_output(struct sort_dimension *sd,
 	return 0;
 }
 
+#ifndef HAVE_LIBTRACEEVENT
+bool perf_hpp__is_dynamic_entry(struct perf_hpp_fmt *fmt __maybe_unused)
+{
+	return false;
+}
+bool perf_hpp__defined_dynamic_entry(struct perf_hpp_fmt *fmt __maybe_unused,
+				     struct hists *hists __maybe_unused)
+{
+	return false;
+}
+#else
 struct hpp_dynamic_entry {
 	struct perf_hpp_fmt hpp;
 	struct evsel *evsel;
@@ -2543,9 +2568,10 @@ static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt,
 		tep_read_number_field(field, a->raw_data, &dyn);
 		offset = dyn & 0xffff;
 		size = (dyn >> 16) & 0xffff;
+#ifdef HAVE_LIBTRACEEVENT_TEP_FIELD_IS_RELATIVE
 		if (field->flags & TEP_FIELD_IS_RELATIVE)
 			offset += field->offset + field->size;
-
+#endif
 		/* record max width for output */
 		if (size > hde->dynamic_len)
 			hde->dynamic_len = size;
@@ -2621,6 +2647,7 @@ __alloc_dynamic_entry(struct evsel *evsel, struct tep_format_field *field,
 
 	return hde;
 }
+#endif /* HAVE_LIBTRACEEVENT */
 
 struct perf_hpp_fmt *perf_hpp_fmt__dup(struct perf_hpp_fmt *fmt)
 {
@@ -2633,6 +2660,7 @@ struct perf_hpp_fmt *perf_hpp_fmt__dup(struct perf_hpp_fmt *fmt)
 		new_hse = memdup(hse, sizeof(*hse));
 		if (new_hse)
 			new_fmt = &new_hse->hpp;
+#ifdef HAVE_LIBTRACEEVENT
 	} else if (perf_hpp__is_dynamic_entry(fmt)) {
 		struct hpp_dynamic_entry *hde, *new_hde;
 
@@ -2640,6 +2668,7 @@ struct perf_hpp_fmt *perf_hpp_fmt__dup(struct perf_hpp_fmt *fmt)
 		new_hde = memdup(hde, sizeof(*hde));
 		if (new_hde)
 			new_fmt = &new_hde->hpp;
+#endif
 	} else {
 		new_fmt = memdup(fmt, sizeof(*fmt));
 	}
@@ -2719,6 +2748,7 @@ static struct evsel *find_evsel(struct evlist *evlist, char *event_name)
 	return evsel;
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 static int __dynamic_dimension__add(struct evsel *evsel,
 				    struct tep_format_field *field,
 				    bool raw_trace, int level)
@@ -2789,13 +2819,13 @@ static int add_all_matching_fields(struct evlist *evlist,
 	}
 	return ret;
 }
+#endif /* HAVE_LIBTRACEEVENT */
 
 static int add_dynamic_entry(struct evlist *evlist, const char *tok,
 			     int level)
 {
 	char *str, *event_name, *field_name, *opt_name;
 	struct evsel *evsel;
-	struct tep_format_field *field;
 	bool raw_trace = symbol_conf.raw_trace;
 	int ret = 0;
 
@@ -2820,6 +2850,7 @@ static int add_dynamic_entry(struct evlist *evlist, const char *tok,
 		raw_trace = true;
 	}
 
+#ifdef HAVE_LIBTRACEEVENT
 	if (!strcmp(field_name, "trace_fields")) {
 		ret = add_all_dynamic_fields(evlist, raw_trace, level);
 		goto out;
@@ -2829,6 +2860,7 @@ static int add_dynamic_entry(struct evlist *evlist, const char *tok,
 		ret = add_all_matching_fields(evlist, field_name, raw_trace, level);
 		goto out;
 	}
+#endif
 
 	evsel = find_evsel(evlist, event_name);
 	if (evsel == NULL) {
@@ -2843,10 +2875,12 @@ static int add_dynamic_entry(struct evlist *evlist, const char *tok,
 		goto out;
 	}
 
+#ifdef HAVE_LIBTRACEEVENT
 	if (!strcmp(field_name, "*")) {
 		ret = add_evsel_fields(evsel, raw_trace, level);
 	} else {
-		field = tep_find_any_field(evsel->tp_format, field_name);
+		struct tep_format_field *field = tep_find_any_field(evsel->tp_format, field_name);
+
 		if (field == NULL) {
 			pr_debug("Cannot find event field for %s.%s\n",
 				 event_name, field_name);
@@ -2855,6 +2889,10 @@ static int add_dynamic_entry(struct evlist *evlist, const char *tok,
 
 		ret = __dynamic_dimension__add(evsel, field, raw_trace, level);
 	}
+#else
+	(void)level;
+	(void)raw_trace;
+#endif /* HAVE_LIBTRACEEVENT */
 
 out:
 	free(str);
@@ -2955,11 +2993,11 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
 	for (i = 0; i < ARRAY_SIZE(common_sort_dimensions); i++) {
 		struct sort_dimension *sd = &common_sort_dimensions[i];
 
-		if (strncasecmp(tok, sd->name, strlen(tok)))
+		if (!sd->name || strncasecmp(tok, sd->name, strlen(tok)))
 			continue;
 
 		for (j = 0; j < ARRAY_SIZE(dynamic_headers); j++) {
-			if (!strcmp(dynamic_headers[j], sd->name))
+			if (sd->name && !strcmp(dynamic_headers[j], sd->name))
 				sort_dimension_add_dynamic_header(sd);
 		}
 
@@ -3009,7 +3047,7 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
 	for (i = 0; i < ARRAY_SIZE(bstack_sort_dimensions); i++) {
 		struct sort_dimension *sd = &bstack_sort_dimensions[i];
 
-		if (strncasecmp(tok, sd->name, strlen(tok)))
+		if (!sd->name || strncasecmp(tok, sd->name, strlen(tok)))
 			continue;
 
 		if (sort__mode != SORT_MODE__BRANCH)
@@ -3025,7 +3063,7 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
 	for (i = 0; i < ARRAY_SIZE(memory_sort_dimensions); i++) {
 		struct sort_dimension *sd = &memory_sort_dimensions[i];
 
-		if (strncasecmp(tok, sd->name, strlen(tok)))
+		if (!sd->name || strncasecmp(tok, sd->name, strlen(tok)))
 			continue;
 
 		if (sort__mode != SORT_MODE__MEMORY)
@@ -3339,7 +3377,7 @@ int output_field_add(struct perf_hpp_list *list, char *tok)
 	for (i = 0; i < ARRAY_SIZE(common_sort_dimensions); i++) {
 		struct sort_dimension *sd = &common_sort_dimensions[i];
 
-		if (strncasecmp(tok, sd->name, strlen(tok)))
+		if (!sd->name || strncasecmp(tok, sd->name, strlen(tok)))
 			continue;
 
 		return __sort_dimension__add_output(list, sd);
@@ -3357,7 +3395,7 @@ int output_field_add(struct perf_hpp_list *list, char *tok)
 	for (i = 0; i < ARRAY_SIZE(bstack_sort_dimensions); i++) {
 		struct sort_dimension *sd = &bstack_sort_dimensions[i];
 
-		if (strncasecmp(tok, sd->name, strlen(tok)))
+		if (!sd->name || strncasecmp(tok, sd->name, strlen(tok)))
 			continue;
 
 		if (sort__mode != SORT_MODE__BRANCH)
@@ -3369,7 +3407,7 @@ int output_field_add(struct perf_hpp_list *list, char *tok)
 	for (i = 0; i < ARRAY_SIZE(memory_sort_dimensions); i++) {
 		struct sort_dimension *sd = &memory_sort_dimensions[i];
 
-		if (strncasecmp(tok, sd->name, strlen(tok)))
+		if (!sd->name || strncasecmp(tok, sd->name, strlen(tok)))
 			continue;
 
 		if (sort__mode != SORT_MODE__MEMORY)
@@ -3508,6 +3546,9 @@ void reset_output_field(void)
 
 static void add_key(struct strbuf *sb, const char *str, int *llen)
 {
+	if (!str)
+		return;
+
 	if (*llen >= 75) {
 		strbuf_addstr(sb, "\n\t\t\t ");
 		*llen = INDENT;
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index ba66bb7fc1ca..8bd8b0142630 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -25,41 +25,124 @@
 #define CNTR_NOT_SUPPORTED	"<not supported>"
 #define CNTR_NOT_COUNTED	"<not counted>"
 
-static void print_running(struct perf_stat_config *config,
-			  u64 run, u64 ena)
+#define METRIC_LEN   38
+#define EVNAME_LEN   32
+#define COUNTS_LEN   18
+#define INTERVAL_LEN 16
+#define CGROUP_LEN   16
+#define COMM_LEN     16
+#define PID_LEN       7
+#define CPUS_LEN      4
+
+static int aggr_header_lens[] = {
+	[AGGR_CORE] 	= 18,
+	[AGGR_DIE] 	= 12,
+	[AGGR_SOCKET] 	= 6,
+	[AGGR_NODE] 	= 6,
+	[AGGR_NONE] 	= 6,
+	[AGGR_THREAD] 	= 16,
+	[AGGR_GLOBAL] 	= 0,
+};
+
+static const char *aggr_header_csv[] = {
+	[AGGR_CORE] 	= 	"core,cpus,",
+	[AGGR_DIE] 	= 	"die,cpus,",
+	[AGGR_SOCKET] 	= 	"socket,cpus,",
+	[AGGR_NONE] 	= 	"cpu,",
+	[AGGR_THREAD] 	= 	"comm-pid,",
+	[AGGR_NODE] 	= 	"node,",
+	[AGGR_GLOBAL] 	=	""
+};
+
+static const char *aggr_header_std[] = {
+	[AGGR_CORE] 	= 	"core",
+	[AGGR_DIE] 	= 	"die",
+	[AGGR_SOCKET] 	= 	"socket",
+	[AGGR_NONE] 	= 	"cpu",
+	[AGGR_THREAD] 	= 	"comm-pid",
+	[AGGR_NODE] 	= 	"node",
+	[AGGR_GLOBAL] 	=	""
+};
+
+static void print_running_std(struct perf_stat_config *config, u64 run, u64 ena)
 {
+	if (run != ena)
+		fprintf(config->output, "  (%.2f%%)", 100.0 * run / ena);
+}
 
+static void print_running_csv(struct perf_stat_config *config, u64 run, u64 ena)
+{
 	double enabled_percent = 100;
 
 	if (run != ena)
 		enabled_percent = 100 * run / ena;
-	if (config->json_output)
-		fprintf(config->output,
-			"\"event-runtime\" : %" PRIu64 ", \"pcnt-running\" : %.2f, ",
-			run, enabled_percent);
-	else if (config->csv_output)
-		fprintf(config->output,
-			"%s%" PRIu64 "%s%.2f", config->csv_sep,
-			run, config->csv_sep, enabled_percent);
-	else if (run != ena)
-		fprintf(config->output, "  (%.2f%%)", 100.0 * run / ena);
+	fprintf(config->output, "%s%" PRIu64 "%s%.2f",
+		config->csv_sep, run, config->csv_sep, enabled_percent);
+}
+
+static void print_running_json(struct perf_stat_config *config, u64 run, u64 ena)
+{
+	double enabled_percent = 100;
+
+	if (run != ena)
+		enabled_percent = 100 * run / ena;
+	fprintf(config->output, "\"event-runtime\" : %" PRIu64 ", \"pcnt-running\" : %.2f, ",
+		run, enabled_percent);
+}
+
+static void print_running(struct perf_stat_config *config,
+			  u64 run, u64 ena, bool before_metric)
+{
+	if (config->json_output) {
+		if (before_metric)
+			print_running_json(config, run, ena);
+	} else if (config->csv_output) {
+		if (before_metric)
+			print_running_csv(config, run, ena);
+	} else {
+		if (!before_metric)
+			print_running_std(config, run, ena);
+	}
+}
+
+static void print_noise_pct_std(struct perf_stat_config *config,
+				double pct)
+{
+	if (pct)
+		fprintf(config->output, "  ( +-%6.2f%% )", pct);
+}
+
+static void print_noise_pct_csv(struct perf_stat_config *config,
+				double pct)
+{
+	fprintf(config->output, "%s%.2f%%", config->csv_sep, pct);
+}
+
+static void print_noise_pct_json(struct perf_stat_config *config,
+				 double pct)
+{
+	fprintf(config->output, "\"variance\" : %.2f, ", pct);
 }
 
 static void print_noise_pct(struct perf_stat_config *config,
-			    double total, double avg)
+			    double total, double avg, bool before_metric)
 {
 	double pct = rel_stddev_stats(total, avg);
 
-	if (config->json_output)
-		fprintf(config->output, "\"variance\" : %.2f, ", pct);
-	else if (config->csv_output)
-		fprintf(config->output, "%s%.2f%%", config->csv_sep, pct);
-	else if (pct)
-		fprintf(config->output, "  ( +-%6.2f%% )", pct);
+	if (config->json_output) {
+		if (before_metric)
+			print_noise_pct_json(config, pct);
+	} else if (config->csv_output) {
+		if (before_metric)
+			print_noise_pct_csv(config, pct);
+	} else {
+		if (!before_metric)
+			print_noise_pct_std(config, pct);
+	}
 }
 
 static void print_noise(struct perf_stat_config *config,
-			struct evsel *evsel, double avg)
+			struct evsel *evsel, double avg, bool before_metric)
 {
 	struct perf_stat_evsel *ps;
 
@@ -67,139 +150,166 @@ static void print_noise(struct perf_stat_config *config,
 		return;
 
 	ps = evsel->stats;
-	print_noise_pct(config, stddev_stats(&ps->res_stats), avg);
+	print_noise_pct(config, stddev_stats(&ps->res_stats), avg, before_metric);
 }
 
-static void print_cgroup(struct perf_stat_config *config, struct evsel *evsel)
+static void print_cgroup_std(struct perf_stat_config *config, const char *cgrp_name)
 {
-	if (nr_cgroups) {
-		const char *cgrp_name = evsel->cgrp ? evsel->cgrp->name  : "";
+	fprintf(config->output, " %-*s", CGROUP_LEN, cgrp_name);
+}
+
+static void print_cgroup_csv(struct perf_stat_config *config, const char *cgrp_name)
+{
+	fprintf(config->output, "%s%s", config->csv_sep, cgrp_name);
+}
+
+static void print_cgroup_json(struct perf_stat_config *config, const char *cgrp_name)
+{
+	fprintf(config->output, "\"cgroup\" : \"%s\", ", cgrp_name);
+}
+
+static void print_cgroup(struct perf_stat_config *config, struct cgroup *cgrp)
+{
+	if (nr_cgroups || config->cgroup_list) {
+		const char *cgrp_name = cgrp ? cgrp->name  : "";
 
 		if (config->json_output)
-			fprintf(config->output, "\"cgroup\" : \"%s\", ", cgrp_name);
+			print_cgroup_json(config, cgrp_name);
+		else if (config->csv_output)
+			print_cgroup_csv(config, cgrp_name);
 		else
-			fprintf(config->output, "%s%s", config->csv_sep, cgrp_name);
+			print_cgroup_std(config, cgrp_name);
 	}
 }
 
-
-static void aggr_printout(struct perf_stat_config *config,
-			  struct evsel *evsel, struct aggr_cpu_id id, int nr)
+static void print_aggr_id_std(struct perf_stat_config *config,
+			      struct evsel *evsel, struct aggr_cpu_id id, int nr)
 {
+	FILE *output = config->output;
+	int idx = config->aggr_mode;
+	char buf[128];
+
+	switch (config->aggr_mode) {
+	case AGGR_CORE:
+		snprintf(buf, sizeof(buf), "S%d-D%d-C%d", id.socket, id.die, id.core);
+		break;
+	case AGGR_DIE:
+		snprintf(buf, sizeof(buf), "S%d-D%d", id.socket, id.die);
+		break;
+	case AGGR_SOCKET:
+		snprintf(buf, sizeof(buf), "S%d", id.socket);
+		break;
+	case AGGR_NODE:
+		snprintf(buf, sizeof(buf), "N%d", id.node);
+		break;
+	case AGGR_NONE:
+		if (evsel->percore && !config->percore_show_thread) {
+			snprintf(buf, sizeof(buf), "S%d-D%d-C%d ",
+				id.socket, id.die, id.core);
+			fprintf(output, "%-*s ",
+				aggr_header_lens[AGGR_CORE], buf);
+		} else if (id.cpu.cpu > -1) {
+			fprintf(output, "CPU%-*d ",
+				aggr_header_lens[AGGR_NONE] - 3, id.cpu.cpu);
+		}
+		return;
+	case AGGR_THREAD:
+		fprintf(output, "%*s-%-*d ",
+			COMM_LEN, perf_thread_map__comm(evsel->core.threads, id.thread_idx),
+			PID_LEN, perf_thread_map__pid(evsel->core.threads, id.thread_idx));
+		return;
+	case AGGR_GLOBAL:
+	case AGGR_UNSET:
+	case AGGR_MAX:
+	default:
+		return;
+	}
 
+	fprintf(output, "%-*s %*d ", aggr_header_lens[idx], buf, 4, nr);
+}
 
-	if (config->json_output && !config->interval)
-		fprintf(config->output, "{");
+static void print_aggr_id_csv(struct perf_stat_config *config,
+			      struct evsel *evsel, struct aggr_cpu_id id, int nr)
+{
+	FILE *output = config->output;
+	const char *sep = config->csv_sep;
 
 	switch (config->aggr_mode) {
 	case AGGR_CORE:
-		if (config->json_output) {
-			fprintf(config->output,
-				"\"core\" : \"S%d-D%d-C%d\", \"aggregate-number\" : %d, ",
-				id.socket,
-				id.die,
-				id.core,
-				nr);
-		} else {
-			fprintf(config->output, "S%d-D%d-C%*d%s%*d%s",
-				id.socket,
-				id.die,
-				config->csv_output ? 0 : -8,
-				id.core,
-				config->csv_sep,
-				config->csv_output ? 0 : 4,
-				nr,
-				config->csv_sep);
-		}
+		fprintf(output, "S%d-D%d-C%d%s%d%s",
+			id.socket, id.die, id.core, sep, nr, sep);
 		break;
 	case AGGR_DIE:
-		if (config->json_output) {
-			fprintf(config->output,
-				"\"die\" : \"S%d-D%d\", \"aggregate-number\" : %d, ",
-				id.socket,
-				id.die,
-				nr);
-		} else {
-			fprintf(config->output, "S%d-D%*d%s%*d%s",
-				id.socket,
-				config->csv_output ? 0 : -8,
-				id.die,
-				config->csv_sep,
-				config->csv_output ? 0 : 4,
-				nr,
-				config->csv_sep);
-		}
+		fprintf(output, "S%d-D%d%s%d%s",
+			id.socket, id.die, sep, nr, sep);
 		break;
 	case AGGR_SOCKET:
-		if (config->json_output) {
-			fprintf(config->output,
-				"\"socket\" : \"S%d\", \"aggregate-number\" : %d, ",
-				id.socket,
-				nr);
-		} else {
-			fprintf(config->output, "S%*d%s%*d%s",
-				config->csv_output ? 0 : -5,
-				id.socket,
-				config->csv_sep,
-				config->csv_output ? 0 : 4,
-				nr,
-				config->csv_sep);
-		}
+		fprintf(output, "S%d%s%d%s",
+			id.socket, sep, nr, sep);
 		break;
 	case AGGR_NODE:
-		if (config->json_output) {
-			fprintf(config->output, "\"node\" : \"N%d\", \"aggregate-number\" : %d, ",
-				id.node,
-				nr);
-		} else {
-			fprintf(config->output, "N%*d%s%*d%s",
-				config->csv_output ? 0 : -5,
-				id.node,
-				config->csv_sep,
-				config->csv_output ? 0 : 4,
-				nr,
-				config->csv_sep);
-		}
+		fprintf(output, "N%d%s%d%s",
+			id.node, sep, nr, sep);
 		break;
 	case AGGR_NONE:
-		if (config->json_output) {
-			if (evsel->percore && !config->percore_show_thread) {
-				fprintf(config->output, "\"core\" : \"S%d-D%d-C%d\"",
-					id.socket,
-					id.die,
-					id.core);
-			} else if (id.cpu.cpu > -1) {
-				fprintf(config->output, "\"cpu\" : \"%d\", ",
-					id.cpu.cpu);
-			}
-		} else {
-			if (evsel->percore && !config->percore_show_thread) {
-				fprintf(config->output, "S%d-D%d-C%*d%s",
-					id.socket,
-					id.die,
-					config->csv_output ? 0 : -3,
-					id.core, config->csv_sep);
-			} else if (id.cpu.cpu > -1) {
-				fprintf(config->output, "CPU%*d%s",
-					config->csv_output ? 0 : -7,
-					id.cpu.cpu, config->csv_sep);
-			}
+		if (evsel->percore && !config->percore_show_thread) {
+			fprintf(output, "S%d-D%d-C%d%s",
+				id.socket, id.die, id.core, sep);
+		} else if (id.cpu.cpu > -1) {
+			fprintf(output, "CPU%d%s",
+				id.cpu.cpu, sep);
 		}
 		break;
 	case AGGR_THREAD:
-		if (config->json_output) {
-			fprintf(config->output, "\"thread\" : \"%s-%d\", ",
-				perf_thread_map__comm(evsel->core.threads, id.thread_idx),
-				perf_thread_map__pid(evsel->core.threads, id.thread_idx));
-		} else {
-			fprintf(config->output, "%*s-%*d%s",
-				config->csv_output ? 0 : 16,
-				perf_thread_map__comm(evsel->core.threads, id.thread_idx),
-				config->csv_output ? 0 : -8,
-				perf_thread_map__pid(evsel->core.threads, id.thread_idx),
-				config->csv_sep);
+		fprintf(output, "%s-%d%s",
+			perf_thread_map__comm(evsel->core.threads, id.thread_idx),
+			perf_thread_map__pid(evsel->core.threads, id.thread_idx),
+			sep);
+		break;
+	case AGGR_GLOBAL:
+	case AGGR_UNSET:
+	case AGGR_MAX:
+	default:
+		break;
+	}
+}
+
+static void print_aggr_id_json(struct perf_stat_config *config,
+			       struct evsel *evsel, struct aggr_cpu_id id, int nr)
+{
+	FILE *output = config->output;
+
+	switch (config->aggr_mode) {
+	case AGGR_CORE:
+		fprintf(output, "\"core\" : \"S%d-D%d-C%d\", \"aggregate-number\" : %d, ",
+			id.socket, id.die, id.core, nr);
+		break;
+	case AGGR_DIE:
+		fprintf(output, "\"die\" : \"S%d-D%d\", \"aggregate-number\" : %d, ",
+			id.socket, id.die, nr);
+		break;
+	case AGGR_SOCKET:
+		fprintf(output, "\"socket\" : \"S%d\", \"aggregate-number\" : %d, ",
+			id.socket, nr);
+		break;
+	case AGGR_NODE:
+		fprintf(output, "\"node\" : \"N%d\", \"aggregate-number\" : %d, ",
+			id.node, nr);
+		break;
+	case AGGR_NONE:
+		if (evsel->percore && !config->percore_show_thread) {
+			fprintf(output, "\"core\" : \"S%d-D%d-C%d\"",
+				id.socket, id.die, id.core);
+		} else if (id.cpu.cpu > -1) {
+			fprintf(output, "\"cpu\" : \"%d\", ",
+				id.cpu.cpu);
 		}
 		break;
+	case AGGR_THREAD:
+		fprintf(output, "\"thread\" : \"%s-%d\", ",
+			perf_thread_map__comm(evsel->core.threads, id.thread_idx),
+			perf_thread_map__pid(evsel->core.threads, id.thread_idx));
+		break;
 	case AGGR_GLOBAL:
 	case AGGR_UNSET:
 	case AGGR_MAX:
@@ -208,18 +318,29 @@ static void aggr_printout(struct perf_stat_config *config,
 	}
 }
 
+static void aggr_printout(struct perf_stat_config *config,
+			  struct evsel *evsel, struct aggr_cpu_id id, int nr)
+{
+	if (config->json_output)
+		print_aggr_id_json(config, evsel, id, nr);
+	else if (config->csv_output)
+		print_aggr_id_csv(config, evsel, id, nr);
+	else
+		print_aggr_id_std(config, evsel, id, nr);
+}
+
 struct outstate {
 	FILE *fh;
 	bool newline;
+	bool first;
 	const char *prefix;
 	int  nfields;
 	int  nr;
 	struct aggr_cpu_id id;
 	struct evsel *evsel;
+	struct cgroup *cgrp;
 };
 
-#define METRIC_LEN  35
-
 static void new_line_std(struct perf_stat_config *config __maybe_unused,
 			 void *ctx)
 {
@@ -232,7 +353,8 @@ static void do_new_line_std(struct perf_stat_config *config,
 			    struct outstate *os)
 {
 	fputc('\n', os->fh);
-	fputs(os->prefix, os->fh);
+	if (os->prefix)
+		fputs(os->prefix, os->fh);
 	aggr_printout(config, os->evsel, os->id, os->nr);
 	if (config->aggr_mode == AGGR_NONE)
 		fprintf(os->fh, "        ");
@@ -319,7 +441,7 @@ static void new_line_json(struct perf_stat_config *config, void *ctx)
 {
 	struct outstate *os = ctx;
 
-	fputc('\n', os->fh);
+	fputs("\n{", os->fh);
 	if (os->prefix)
 		fprintf(os->fh, "%s", os->prefix);
 	aggr_printout(config, os->evsel, os->id, os->nr);
@@ -368,6 +490,7 @@ static void print_metric_only(struct perf_stat_config *config,
 
 	color_snprintf(str, sizeof(str), color ?: "", fmt, val);
 	fprintf(out, "%*s ", mlen, str);
+	os->first = false;
 }
 
 static void print_metric_only_csv(struct perf_stat_config *config __maybe_unused,
@@ -389,6 +512,7 @@ static void print_metric_only_csv(struct perf_stat_config *config __maybe_unused
 		ends++;
 	*ends = 0;
 	fprintf(out, "%s%s", vals, config->csv_sep);
+	os->first = false;
 }
 
 static void print_metric_only_json(struct perf_stat_config *config __maybe_unused,
@@ -409,7 +533,10 @@ static void print_metric_only_json(struct perf_stat_config *config __maybe_unuse
 	while (isdigit(*ends) || *ends == '.')
 		ends++;
 	*ends = 0;
-	fprintf(out, "{\"metric-value\" : \"%s\"}", vals);
+	if (!unit[0] || !vals[0])
+		return;
+	fprintf(out, "%s\"%s\" : \"%s\"", os->first ? "" : ", ", unit, vals);
+	os->first = false;
 }
 
 static void new_line_metric(struct perf_stat_config *config __maybe_unused,
@@ -430,84 +557,100 @@ static void print_metric_header(struct perf_stat_config *config,
 	    os->evsel->priv != os->evsel->evlist->selected->priv)
 		return;
 
-	if (!valid_only_metric(unit) && !config->json_output)
+	if (os->evsel->cgrp != os->cgrp)
+		return;
+
+	if (!valid_only_metric(unit))
 		return;
 	unit = fixunit(tbuf, os->evsel, unit);
 
 	if (config->json_output)
-		fprintf(os->fh, "\"unit\" : \"%s\"", unit);
+		return;
 	else if (config->csv_output)
 		fprintf(os->fh, "%s%s", unit, config->csv_sep);
 	else
 		fprintf(os->fh, "%*s ", config->metric_only_len, unit);
 }
 
-static int first_shadow_map_idx(struct perf_stat_config *config,
-				struct evsel *evsel, const struct aggr_cpu_id *id)
+static void print_counter_value_std(struct perf_stat_config *config,
+				    struct evsel *evsel, double avg, bool ok)
 {
-	struct perf_cpu_map *cpus = evsel__cpus(evsel);
-	struct perf_cpu cpu;
-	int idx;
-
-	if (config->aggr_mode == AGGR_NONE)
-		return perf_cpu_map__idx(cpus, id->cpu);
+	FILE *output = config->output;
+	double sc =  evsel->scale;
+	const char *fmt;
+	const char *bad_count = evsel->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED;
 
-	if (config->aggr_mode == AGGR_THREAD)
-		return id->thread_idx;
+	if (config->big_num)
+		fmt = floor(sc) != sc ? "%'*.2f " : "%'*.0f ";
+	else
+		fmt = floor(sc) != sc ? "%*.2f " : "%*.0f ";
 
-	if (!config->aggr_get_id)
-		return 0;
+	if (ok)
+		fprintf(output, fmt, COUNTS_LEN, avg);
+	else
+		fprintf(output, "%*s ", COUNTS_LEN, bad_count);
 
-	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
-		struct aggr_cpu_id cpu_id = config->aggr_get_id(config, cpu);
+	if (evsel->unit)
+		fprintf(output, "%-*s ", config->unit_width, evsel->unit);
 
-		if (aggr_cpu_id__equal(&cpu_id, id))
-			return idx;
-	}
-	return 0;
+	fprintf(output, "%-*s", EVNAME_LEN, evsel__name(evsel));
 }
 
-static void abs_printout(struct perf_stat_config *config,
-			 struct aggr_cpu_id id, int nr, struct evsel *evsel, double avg)
+static void print_counter_value_csv(struct perf_stat_config *config,
+				    struct evsel *evsel, double avg, bool ok)
 {
 	FILE *output = config->output;
 	double sc =  evsel->scale;
-	const char *fmt;
+	const char *sep = config->csv_sep;
+	const char *fmt = floor(sc) != sc ? "%.2f%s" : "%.0f%s";
+	const char *bad_count = evsel->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED;
 
-	if (config->csv_output) {
-		fmt = floor(sc) != sc ?  "%.2f%s" : "%.0f%s";
-	} else {
-		if (config->big_num)
-			fmt = floor(sc) != sc ? "%'18.2f%s" : "%'18.0f%s";
-		else
-			fmt = floor(sc) != sc ? "%18.2f%s" : "%18.0f%s";
-	}
+	if (ok)
+		fprintf(output, fmt, avg, sep);
+	else
+		fprintf(output, "%s%s", bad_count, sep);
 
-	aggr_printout(config, evsel, id, nr);
+	if (evsel->unit)
+		fprintf(output, "%s%s", evsel->unit, sep);
 
-	if (config->json_output)
+	fprintf(output, "%s", evsel__name(evsel));
+}
+
+static void print_counter_value_json(struct perf_stat_config *config,
+				     struct evsel *evsel, double avg, bool ok)
+{
+	FILE *output = config->output;
+	const char *bad_count = evsel->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED;
+
+	if (ok)
 		fprintf(output, "\"counter-value\" : \"%f\", ", avg);
 	else
-		fprintf(output, fmt, avg, config->csv_sep);
+		fprintf(output, "\"counter-value\" : \"%s\", ", bad_count);
 
-	if (config->json_output) {
-		if (evsel->unit) {
-			fprintf(output, "\"unit\" : \"%s\", ",
-				evsel->unit);
-		}
-	} else {
-		if (evsel->unit)
-			fprintf(output, "%-*s%s",
-				config->csv_output ? 0 : config->unit_width,
-				evsel->unit, config->csv_sep);
-	}
+	if (evsel->unit)
+		fprintf(output, "\"unit\" : \"%s\", ", evsel->unit);
 
+	fprintf(output, "\"event\" : \"%s\", ", evsel__name(evsel));
+}
+
+static void print_counter_value(struct perf_stat_config *config,
+				struct evsel *evsel, double avg, bool ok)
+{
 	if (config->json_output)
-		fprintf(output, "\"event\" : \"%s\", ", evsel__name(evsel));
+		print_counter_value_json(config, evsel, avg, ok);
+	else if (config->csv_output)
+		print_counter_value_csv(config, evsel, avg, ok);
 	else
-		fprintf(output, "%-*s", config->csv_output ? 0 : 32, evsel__name(evsel));
+		print_counter_value_std(config, evsel, avg, ok);
+}
 
-	print_cgroup(config, evsel);
+static void abs_printout(struct perf_stat_config *config,
+			 struct aggr_cpu_id id, int nr,
+			 struct evsel *evsel, double avg, bool ok)
+{
+	aggr_printout(config, evsel, id, nr);
+	print_counter_value(config, evsel, avg, ok);
+	print_cgroup(config, evsel->cgrp);
 }
 
 static bool is_mixed_hw_group(struct evsel *counter)
@@ -534,37 +677,19 @@ static bool is_mixed_hw_group(struct evsel *counter)
 	return false;
 }
 
-static void printout(struct perf_stat_config *config, struct aggr_cpu_id id, int nr,
-		     struct evsel *counter, double uval,
-		     char *prefix, u64 run, u64 ena, double noise,
-		     struct runtime_stat *st)
+static void printout(struct perf_stat_config *config, struct outstate *os,
+		     double uval, u64 run, u64 ena, double noise, int map_idx)
 {
 	struct perf_stat_output_ctx out;
-	struct outstate os = {
-		.fh = config->output,
-		.prefix = prefix ? prefix : "",
-		.id = id,
-		.nr = nr,
-		.evsel = counter,
-	};
 	print_metric_t pm;
 	new_line_t nl;
+	bool ok = true;
+	struct evsel *counter = os->evsel;
 
 	if (config->csv_output) {
-		static const int aggr_fields[AGGR_MAX] = {
-			[AGGR_NONE] = 1,
-			[AGGR_GLOBAL] = 0,
-			[AGGR_SOCKET] = 2,
-			[AGGR_DIE] = 2,
-			[AGGR_CORE] = 2,
-			[AGGR_THREAD] = 1,
-			[AGGR_UNSET] = 0,
-			[AGGR_NODE] = 1,
-		};
-
 		pm = config->metric_only ? print_metric_only_csv : print_metric_csv;
 		nl = config->metric_only ? new_line_metric : new_line_csv;
-		os.nfields = 3 + aggr_fields[config->aggr_mode] + (counter->cgrp ? 1 : 0);
+		os->nfields = 4 + (counter->cgrp ? 1 : 0);
 	} else if (config->json_output) {
 		pm = config->metric_only ? print_metric_only_json : print_metric_json;
 		nl = config->metric_only ? new_line_metric : new_line_json;
@@ -573,27 +698,13 @@ static void printout(struct perf_stat_config *config, struct aggr_cpu_id id, int
 		nl = config->metric_only ? new_line_metric : new_line_std;
 	}
 
-	if (!config->no_csv_summary && config->csv_output &&
-	    config->summary && !config->interval) {
-		fprintf(config->output, "%16s%s", "summary", config->csv_sep);
-	}
-
 	if (run == 0 || ena == 0 || counter->counts->scaled == -1) {
 		if (config->metric_only) {
-			pm(config, &os, NULL, "", "", 0);
+			pm(config, os, NULL, "", "", 0);
 			return;
 		}
-		aggr_printout(config, counter, id, nr);
 
-		if (config->json_output) {
-			fprintf(config->output, "\"counter-value\" : \"%s\", ",
-					counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED);
-		} else {
-			fprintf(config->output, "%*s%s",
-				config->csv_output ? 0 : 18,
-				counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
-				config->csv_sep);
-		}
+		ok = false;
 
 		if (counter->supported) {
 			if (!evlist__has_hybrid(counter->evlist)) {
@@ -602,86 +713,30 @@ static void printout(struct perf_stat_config *config, struct aggr_cpu_id id, int
 					config->print_mixed_hw_group_error = 1;
 			}
 		}
-
-		if (config->json_output) {
-			fprintf(config->output, "\"unit\" : \"%s\", ", counter->unit);
-		} else {
-			fprintf(config->output, "%-*s%s",
-				config->csv_output ? 0 : config->unit_width,
-				counter->unit, config->csv_sep);
-		}
-
-		if (config->json_output) {
-			fprintf(config->output, "\"event\" : \"%s\", ",
-				evsel__name(counter));
-		} else {
-			fprintf(config->output, "%*s",
-				 config->csv_output ? 0 : -25, evsel__name(counter));
-		}
-
-		print_cgroup(config, counter);
-
-		if (!config->csv_output && !config->json_output)
-			pm(config, &os, NULL, NULL, "", 0);
-		print_noise(config, counter, noise);
-		print_running(config, run, ena);
-		if (config->csv_output)
-			pm(config, &os, NULL, NULL, "", 0);
-		else if (config->json_output)
-			pm(config, &os, NULL, NULL, "", 0);
-		return;
 	}
 
-	if (!config->metric_only)
-		abs_printout(config, id, nr, counter, uval);
-
 	out.print_metric = pm;
 	out.new_line = nl;
-	out.ctx = &os;
+	out.ctx = os;
 	out.force_header = false;
 
-	if (config->csv_output && !config->metric_only) {
-		print_noise(config, counter, noise);
-		print_running(config, run, ena);
-	} else if (config->json_output && !config->metric_only) {
-		print_noise(config, counter, noise);
-		print_running(config, run, ena);
-	}
+	if (!config->metric_only) {
+		abs_printout(config, os->id, os->nr, counter, uval, ok);
 
-	perf_stat__print_shadow_stats(config, counter, uval,
-				first_shadow_map_idx(config, counter, &id),
-				&out, &config->metric_events, st);
-	if (!config->csv_output && !config->metric_only && !config->json_output) {
-		print_noise(config, counter, noise);
-		print_running(config, run, ena);
+		print_noise(config, counter, noise, /*before_metric=*/true);
+		print_running(config, run, ena, /*before_metric=*/true);
 	}
-}
 
-static void aggr_update_shadow(struct perf_stat_config *config,
-			       struct evlist *evlist)
-{
-	int idx, s;
-	struct perf_cpu cpu;
-	struct aggr_cpu_id s2, id;
-	u64 val;
-	struct evsel *counter;
-	struct perf_cpu_map *cpus;
+	if (ok) {
+		perf_stat__print_shadow_stats(config, counter, uval, map_idx,
+					      &out, &config->metric_events, &rt_stat);
+	} else {
+		pm(config, os, /*color=*/NULL, /*format=*/NULL, /*unit=*/"", /*val=*/0);
+	}
 
-	for (s = 0; s < config->aggr_map->nr; s++) {
-		id = config->aggr_map->map[s];
-		evlist__for_each_entry(evlist, counter) {
-			cpus = evsel__cpus(counter);
-			val = 0;
-			perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
-				s2 = config->aggr_get_id(config, cpu);
-				if (!aggr_cpu_id__equal(&s2, &id))
-					continue;
-				val += perf_counts(counter->counts, idx, 0)->val;
-			}
-			perf_stat__update_shadow_stats(counter, val,
-					first_shadow_map_idx(config, counter, &id),
-					&rt_stat);
-		}
+	if (!config->metric_only) {
+		print_noise(config, counter, noise, /*before_metric=*/false);
+		print_running(config, run, ena, /*before_metric=*/false);
 	}
 }
 
@@ -704,7 +759,7 @@ static void uniquify_event_name(struct evsel *counter)
 			counter->name = new_name;
 		}
 	} else {
-		if (perf_pmu__has_hybrid()) {
+		if (evsel__is_hybrid(counter)) {
 			ret = asprintf(&new_name, "%s/%s/",
 				       counter->pmu_name, counter->name);
 		} else {
@@ -721,366 +776,180 @@ static void uniquify_event_name(struct evsel *counter)
 	counter->uniquified_name = true;
 }
 
-static void collect_all_aliases(struct perf_stat_config *config, struct evsel *counter,
-			    void (*cb)(struct perf_stat_config *config, struct evsel *counter, void *data,
-				       bool first),
-			    void *data)
-{
-	struct evlist *evlist = counter->evlist;
-	struct evsel *alias;
-
-	alias = list_prepare_entry(counter, &(evlist->core.entries), core.node);
-	list_for_each_entry_continue (alias, &evlist->core.entries, core.node) {
-		/* Merge events with the same name, etc. but on different PMUs. */
-		if (!strcmp(evsel__name(alias), evsel__name(counter)) &&
-			alias->scale == counter->scale &&
-			alias->cgrp == counter->cgrp &&
-			!strcmp(alias->unit, counter->unit) &&
-			evsel__is_clock(alias) == evsel__is_clock(counter) &&
-			strcmp(alias->pmu_name, counter->pmu_name)) {
-			alias->merged_stat = true;
-			cb(config, alias, data, false);
-		}
-	}
-}
-
-static bool is_uncore(struct evsel *evsel)
+static bool hybrid_uniquify(struct evsel *evsel, struct perf_stat_config *config)
 {
-	struct perf_pmu *pmu = evsel__find_pmu(evsel);
-
-	return pmu && pmu->is_uncore;
+	return evsel__is_hybrid(evsel) && !config->hybrid_merge;
 }
 
-static bool hybrid_uniquify(struct evsel *evsel)
+static void uniquify_counter(struct perf_stat_config *config, struct evsel *counter)
 {
-	return perf_pmu__has_hybrid() && !is_uncore(evsel);
+	if (config->no_merge || hybrid_uniquify(counter, config))
+		uniquify_event_name(counter);
 }
 
-static bool hybrid_merge(struct evsel *counter, struct perf_stat_config *config,
-			 bool check)
+static void print_counter_aggrdata(struct perf_stat_config *config,
+				   struct evsel *counter, int s,
+				   struct outstate *os)
 {
-	if (hybrid_uniquify(counter)) {
-		if (check)
-			return config && config->hybrid_merge;
-		else
-			return config && !config->hybrid_merge;
-	}
+	FILE *output = config->output;
+	u64 ena, run, val;
+	double uval;
+	struct perf_stat_evsel *ps = counter->stats;
+	struct perf_stat_aggr *aggr = &ps->aggr[s];
+	struct aggr_cpu_id id = config->aggr_map->map[s];
+	double avg = aggr->counts.val;
+	bool metric_only = config->metric_only;
 
-	return false;
-}
+	os->id = id;
+	os->nr = aggr->nr;
+	os->evsel = counter;
 
-static bool collect_data(struct perf_stat_config *config, struct evsel *counter,
-			    void (*cb)(struct perf_stat_config *config, struct evsel *counter, void *data,
-				       bool first),
-			    void *data)
-{
+	/* Skip already merged uncore/hybrid events */
 	if (counter->merged_stat)
-		return false;
-	cb(config, counter, data, true);
-	if (config->no_merge || hybrid_merge(counter, config, false))
-		uniquify_event_name(counter);
-	else if (counter->auto_merge_stats || hybrid_merge(counter, config, true))
-		collect_all_aliases(config, counter, cb, data);
-	return true;
-}
+		return;
 
-struct aggr_data {
-	u64 ena, run, val;
-	struct aggr_cpu_id id;
-	int nr;
-	int cpu_map_idx;
-};
+	uniquify_counter(config, counter);
 
-static void aggr_cb(struct perf_stat_config *config,
-		    struct evsel *counter, void *data, bool first)
-{
-	struct aggr_data *ad = data;
-	int idx;
-	struct perf_cpu cpu;
-	struct perf_cpu_map *cpus;
-	struct aggr_cpu_id s2;
+	val = aggr->counts.val;
+	ena = aggr->counts.ena;
+	run = aggr->counts.run;
 
-	cpus = evsel__cpus(counter);
-	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
-		struct perf_counts_values *counts;
+	/*
+	 * Skip value 0 when enabling --per-thread globally, otherwise it will
+	 * have too many 0 output.
+	 */
+	if (val == 0 && config->aggr_mode == AGGR_THREAD && config->system_wide)
+		return;
 
-		s2 = config->aggr_get_id(config, cpu);
-		if (!aggr_cpu_id__equal(&s2, &ad->id))
-			continue;
-		if (first)
-			ad->nr++;
-		counts = perf_counts(counter->counts, idx, 0);
-		/*
-		 * When any result is bad, make them all to give
-		 * consistent output in interval mode.
-		 */
-		if (counts->ena == 0 || counts->run == 0 ||
-		    counter->counts->scaled == -1) {
-			ad->ena = 0;
-			ad->run = 0;
-			break;
-		}
-		ad->val += counts->val;
-		ad->ena += counts->ena;
-		ad->run += counts->run;
+	if (!metric_only) {
+		if (config->json_output)
+			fputc('{', output);
+		if (os->prefix)
+			fprintf(output, "%s", os->prefix);
+		else if (config->summary && config->csv_output &&
+			 !config->no_csv_summary && !config->interval)
+			fprintf(output, "%s%s", "summary", config->csv_sep);
 	}
+
+	uval = val * counter->scale;
+
+	printout(config, os, uval, run, ena, avg, s);
+
+	if (!metric_only)
+		fputc('\n', output);
 }
 
-static void print_counter_aggrdata(struct perf_stat_config *config,
-				   struct evsel *counter, int s,
-				   char *prefix, bool metric_only,
-				   bool *first, struct perf_cpu cpu)
+static void print_metric_begin(struct perf_stat_config *config,
+			       struct evlist *evlist,
+			       struct outstate *os, int aggr_idx)
 {
-	struct aggr_data ad;
-	FILE *output = config->output;
-	u64 ena, run, val;
-	int nr;
+	struct perf_stat_aggr *aggr;
 	struct aggr_cpu_id id;
-	double uval;
+	struct evsel *evsel;
 
-	ad.id = id = config->aggr_map->map[s];
-	ad.val = ad.ena = ad.run = 0;
-	ad.nr = 0;
-	if (!collect_data(config, counter, aggr_cb, &ad))
+	os->first = true;
+	if (!config->metric_only)
 		return;
 
-	if (perf_pmu__has_hybrid() && ad.ena == 0)
-		return;
+	if (config->json_output)
+		fputc('{', config->output);
+	if (os->prefix)
+		fprintf(config->output, "%s", os->prefix);
 
-	nr = ad.nr;
-	ena = ad.ena;
-	run = ad.run;
-	val = ad.val;
-	if (*first && metric_only) {
-		*first = false;
-		aggr_printout(config, counter, id, nr);
-	}
-	if (prefix && !metric_only)
-		fprintf(output, "%s", prefix);
+	evsel = evlist__first(evlist);
+	id = config->aggr_map->map[aggr_idx];
+	aggr = &evsel->stats->aggr[aggr_idx];
+	aggr_printout(config, evsel, id, aggr->nr);
 
-	uval = val * counter->scale;
-	if (cpu.cpu != -1)
-		id = aggr_cpu_id__cpu(cpu, /*data=*/NULL);
+	print_cgroup(config, os->cgrp ? : evsel->cgrp);
+}
 
-	printout(config, id, nr, counter, uval,
-		 prefix, run, ena, 1.0, &rt_stat);
-	if (!metric_only)
-		fputc('\n', output);
+static void print_metric_end(struct perf_stat_config *config, struct outstate *os)
+{
+	FILE *output = config->output;
+
+	if (!config->metric_only)
+		return;
+
+	if (config->json_output) {
+		if (os->first)
+			fputs("\"metric-value\" : \"none\"", output);
+		fputc('}', output);
+	}
+	fputc('\n', output);
 }
 
 static void print_aggr(struct perf_stat_config *config,
 		       struct evlist *evlist,
-		       char *prefix)
+		       struct outstate *os)
 {
-	bool metric_only = config->metric_only;
-	FILE *output = config->output;
 	struct evsel *counter;
 	int s;
-	bool first;
 
 	if (!config->aggr_map || !config->aggr_get_id)
 		return;
 
-	aggr_update_shadow(config, evlist);
-
 	/*
 	 * With metric_only everything is on a single line.
 	 * Without each counter has its own line.
 	 */
 	for (s = 0; s < config->aggr_map->nr; s++) {
-		if (prefix && metric_only)
-			fprintf(output, "%s", prefix);
+		print_metric_begin(config, evlist, os, s);
 
-		first = true;
 		evlist__for_each_entry(evlist, counter) {
-			print_counter_aggrdata(config, counter, s,
-					prefix, metric_only,
-					&first, (struct perf_cpu){ .cpu = -1 });
+			print_counter_aggrdata(config, counter, s, os);
 		}
-		if (metric_only)
-			fputc('\n', output);
+		print_metric_end(config, os);
 	}
 }
 
-static int cmp_val(const void *a, const void *b)
+static void print_aggr_cgroup(struct perf_stat_config *config,
+			      struct evlist *evlist,
+			      struct outstate *os)
 {
-	return ((struct perf_aggr_thread_value *)b)->val -
-		((struct perf_aggr_thread_value *)a)->val;
-}
-
-static struct perf_aggr_thread_value *sort_aggr_thread(
-					struct evsel *counter,
-					int *ret,
-					struct target *_target)
-{
-	int nthreads = perf_thread_map__nr(counter->core.threads);
-	int i = 0;
-	double uval;
-	struct perf_aggr_thread_value *buf;
-
-	buf = calloc(nthreads, sizeof(struct perf_aggr_thread_value));
-	if (!buf)
-		return NULL;
-
-	for (int thread = 0; thread < nthreads; thread++) {
-		int idx;
-		u64 ena = 0, run = 0, val = 0;
-
-		perf_cpu_map__for_each_idx(idx, evsel__cpus(counter)) {
-			struct perf_counts_values *counts =
-				perf_counts(counter->counts, idx, thread);
-
-			val += counts->val;
-			ena += counts->ena;
-			run += counts->run;
-		}
+	struct evsel *counter, *evsel;
+	int s;
 
-		uval = val * counter->scale;
+	if (!config->aggr_map || !config->aggr_get_id)
+		return;
 
-		/*
-		 * Skip value 0 when enabling --per-thread globally,
-		 * otherwise too many 0 output.
-		 */
-		if (uval == 0.0 && target__has_per_thread(_target))
+	evlist__for_each_entry(evlist, evsel) {
+		if (os->cgrp == evsel->cgrp)
 			continue;
 
-		buf[i].counter = counter;
-		buf[i].id = aggr_cpu_id__empty();
-		buf[i].id.thread_idx = thread;
-		buf[i].uval = uval;
-		buf[i].val = val;
-		buf[i].run = run;
-		buf[i].ena = ena;
-		i++;
-	}
+		os->cgrp = evsel->cgrp;
 
-	qsort(buf, i, sizeof(struct perf_aggr_thread_value), cmp_val);
-
-	if (ret)
-		*ret = i;
-
-	return buf;
-}
-
-static void print_aggr_thread(struct perf_stat_config *config,
-			      struct target *_target,
-			      struct evsel *counter, char *prefix)
-{
-	FILE *output = config->output;
-	int thread, sorted_threads;
-	struct aggr_cpu_id id;
-	struct perf_aggr_thread_value *buf;
-
-	buf = sort_aggr_thread(counter, &sorted_threads, _target);
-	if (!buf) {
-		perror("cannot sort aggr thread");
-		return;
-	}
+		for (s = 0; s < config->aggr_map->nr; s++) {
+			print_metric_begin(config, evlist, os, s);
 
-	for (thread = 0; thread < sorted_threads; thread++) {
-		if (prefix)
-			fprintf(output, "%s", prefix);
+			evlist__for_each_entry(evlist, counter) {
+				if (counter->cgrp != os->cgrp)
+					continue;
 
-		id = buf[thread].id;
-		printout(config, id, 0, buf[thread].counter, buf[thread].uval,
-			 prefix, buf[thread].run, buf[thread].ena, 1.0,
-			 &rt_stat);
-		fputc('\n', output);
+				print_counter_aggrdata(config, counter, s, os);
+			}
+			print_metric_end(config, os);
+		}
 	}
-
-	free(buf);
-}
-
-struct caggr_data {
-	double avg, avg_enabled, avg_running;
-};
-
-static void counter_aggr_cb(struct perf_stat_config *config __maybe_unused,
-			    struct evsel *counter, void *data,
-			    bool first __maybe_unused)
-{
-	struct caggr_data *cd = data;
-	struct perf_counts_values *aggr = &counter->counts->aggr;
-
-	cd->avg += aggr->val;
-	cd->avg_enabled += aggr->ena;
-	cd->avg_running += aggr->run;
-}
-
-/*
- * Print out the results of a single counter:
- * aggregated counts in system-wide mode
- */
-static void print_counter_aggr(struct perf_stat_config *config,
-			       struct evsel *counter, char *prefix)
-{
-	bool metric_only = config->metric_only;
-	FILE *output = config->output;
-	double uval;
-	struct caggr_data cd = { .avg = 0.0 };
-
-	if (!collect_data(config, counter, counter_aggr_cb, &cd))
-		return;
-
-	if (prefix && !metric_only)
-		fprintf(output, "%s", prefix);
-
-	uval = cd.avg * counter->scale;
-	printout(config, aggr_cpu_id__empty(), 0, counter, uval, prefix, cd.avg_running,
-		 cd.avg_enabled, cd.avg, &rt_stat);
-	if (!metric_only)
-		fprintf(output, "\n");
-}
-
-static void counter_cb(struct perf_stat_config *config __maybe_unused,
-		       struct evsel *counter, void *data,
-		       bool first __maybe_unused)
-{
-	struct aggr_data *ad = data;
-
-	ad->val += perf_counts(counter->counts, ad->cpu_map_idx, 0)->val;
-	ad->ena += perf_counts(counter->counts, ad->cpu_map_idx, 0)->ena;
-	ad->run += perf_counts(counter->counts, ad->cpu_map_idx, 0)->run;
 }
 
-/*
- * Print out the results of a single counter:
- * does not use aggregated count in system-wide
- */
 static void print_counter(struct perf_stat_config *config,
-			  struct evsel *counter, char *prefix)
+			  struct evsel *counter, struct outstate *os)
 {
-	FILE *output = config->output;
-	u64 ena, run, val;
-	double uval;
-	int idx;
-	struct perf_cpu cpu;
-	struct aggr_cpu_id id;
-
-	perf_cpu_map__for_each_cpu(cpu, idx, evsel__cpus(counter)) {
-		struct aggr_data ad = { .cpu_map_idx = idx };
-
-		if (!collect_data(config, counter, counter_cb, &ad))
-			return;
-		val = ad.val;
-		ena = ad.ena;
-		run = ad.run;
-
-		if (prefix)
-			fprintf(output, "%s", prefix);
+	int s;
 
-		uval = val * counter->scale;
-		id = aggr_cpu_id__cpu(cpu, /*data=*/NULL);
-		printout(config, id, 0, counter, uval, prefix,
-			 run, ena, 1.0, &rt_stat);
+	/* AGGR_THREAD doesn't have config->aggr_get_id */
+	if (!config->aggr_map)
+		return;
 
-		fputc('\n', output);
+	for (s = 0; s < config->aggr_map->nr; s++) {
+		print_counter_aggrdata(config, counter, s, os);
 	}
 }
 
 static void print_no_aggr_metric(struct perf_stat_config *config,
 				 struct evlist *evlist,
-				 char *prefix)
+				 struct outstate *os)
 {
 	int all_idx;
 	struct perf_cpu cpu;
@@ -1092,214 +961,241 @@ static void print_no_aggr_metric(struct perf_stat_config *config,
 		evlist__for_each_entry(evlist, counter) {
 			u64 ena, run, val;
 			double uval;
-			struct aggr_cpu_id id;
+			struct perf_stat_evsel *ps = counter->stats;
 			int counter_idx = perf_cpu_map__idx(evsel__cpus(counter), cpu);
 
 			if (counter_idx < 0)
 				continue;
 
-			id = aggr_cpu_id__cpu(cpu, /*data=*/NULL);
+			os->evsel = counter;
+			os->id = aggr_cpu_id__cpu(cpu, /*data=*/NULL);
 			if (first) {
-				if (prefix)
-					fputs(prefix, config->output);
-				aggr_printout(config, counter, id, 0);
+				print_metric_begin(config, evlist, os, counter_idx);
 				first = false;
 			}
-			val = perf_counts(counter->counts, counter_idx, 0)->val;
-			ena = perf_counts(counter->counts, counter_idx, 0)->ena;
-			run = perf_counts(counter->counts, counter_idx, 0)->run;
+			val = ps->aggr[counter_idx].counts.val;
+			ena = ps->aggr[counter_idx].counts.ena;
+			run = ps->aggr[counter_idx].counts.run;
 
 			uval = val * counter->scale;
-			printout(config, id, 0, counter, uval, prefix,
-				 run, ena, 1.0, &rt_stat);
+			printout(config, os, uval, run, ena, 1.0, counter_idx);
 		}
 		if (!first)
-			fputc('\n', config->output);
+			print_metric_end(config, os);
 	}
 }
 
-static int aggr_header_lens[] = {
-	[AGGR_CORE] = 24,
-	[AGGR_DIE] = 18,
-	[AGGR_SOCKET] = 12,
-	[AGGR_NONE] = 6,
-	[AGGR_THREAD] = 24,
-	[AGGR_NODE] = 6,
-	[AGGR_GLOBAL] = 0,
-};
+static void print_metric_headers_std(struct perf_stat_config *config,
+				     bool no_indent)
+{
+	fputc(' ', config->output);
 
-static const char *aggr_header_csv[] = {
-	[AGGR_CORE] 	= 	"core,cpus,",
-	[AGGR_DIE] 	= 	"die,cpus",
-	[AGGR_SOCKET] 	= 	"socket,cpus",
-	[AGGR_NONE] 	= 	"cpu,",
-	[AGGR_THREAD] 	= 	"comm-pid,",
-	[AGGR_NODE] 	= 	"node,",
-	[AGGR_GLOBAL] 	=	""
-};
+	if (!no_indent) {
+		int len = aggr_header_lens[config->aggr_mode];
+
+		if (nr_cgroups || config->cgroup_list)
+			len += CGROUP_LEN + 1;
+
+		fprintf(config->output, "%*s", len, "");
+	}
+}
+
+static void print_metric_headers_csv(struct perf_stat_config *config,
+				     bool no_indent __maybe_unused)
+{
+	if (config->interval)
+		fputs("time,", config->output);
+	if (!config->iostat_run)
+		fputs(aggr_header_csv[config->aggr_mode], config->output);
+}
+
+static void print_metric_headers_json(struct perf_stat_config *config __maybe_unused,
+				      bool no_indent __maybe_unused)
+{
+}
 
 static void print_metric_headers(struct perf_stat_config *config,
-				 struct evlist *evlist,
-				 const char *prefix, bool no_indent)
+				 struct evlist *evlist, bool no_indent)
 {
-	struct perf_stat_output_ctx out;
 	struct evsel *counter;
 	struct outstate os = {
 		.fh = config->output
 	};
-	bool first = true;
-
-		if (config->json_output && !config->interval)
-			fprintf(config->output, "{");
+	struct perf_stat_output_ctx out = {
+		.ctx = &os,
+		.print_metric = print_metric_header,
+		.new_line = new_line_metric,
+		.force_header = true,
+	};
 
-	if (prefix && !config->json_output)
-		fprintf(config->output, "%s", prefix);
+	if (config->json_output)
+		print_metric_headers_json(config, no_indent);
+	else if (config->csv_output)
+		print_metric_headers_csv(config, no_indent);
+	else
+		print_metric_headers_std(config, no_indent);
 
-	if (!config->csv_output && !no_indent)
-		fprintf(config->output, "%*s",
-			aggr_header_lens[config->aggr_mode], "");
-	if (config->csv_output) {
-		if (config->interval)
-			fputs("time,", config->output);
-		if (!config->iostat_run)
-			fputs(aggr_header_csv[config->aggr_mode], config->output);
-	}
 	if (config->iostat_run)
 		iostat_print_header_prefix(config);
 
+	if (config->cgroup_list)
+		os.cgrp = evlist__first(evlist)->cgrp;
+
 	/* Print metrics headers only */
 	evlist__for_each_entry(evlist, counter) {
 		os.evsel = counter;
-		out.ctx = &os;
-		out.print_metric = print_metric_header;
-		if (!first && config->json_output)
-			fprintf(config->output, ", ");
-		first = false;
-		out.new_line = new_line_metric;
-		out.force_header = true;
+
 		perf_stat__print_shadow_stats(config, counter, 0,
 					      0,
 					      &out,
 					      &config->metric_events,
 					      &rt_stat);
 	}
+
+	if (!config->json_output)
+		fputc('\n', config->output);
+}
+
+static void prepare_interval(struct perf_stat_config *config,
+			     char *prefix, size_t len, struct timespec *ts)
+{
+	if (config->iostat_run)
+		return;
+
 	if (config->json_output)
-		fprintf(config->output, "}");
-	fputc('\n', config->output);
+		scnprintf(prefix, len, "\"interval\" : %lu.%09lu, ",
+			  (unsigned long) ts->tv_sec, ts->tv_nsec);
+	else if (config->csv_output)
+		scnprintf(prefix, len, "%lu.%09lu%s",
+			  (unsigned long) ts->tv_sec, ts->tv_nsec, config->csv_sep);
+	else
+		scnprintf(prefix, len, "%6lu.%09lu ",
+			  (unsigned long) ts->tv_sec, ts->tv_nsec);
 }
 
-static void print_interval(struct perf_stat_config *config,
-			   struct evlist *evlist,
-			   char *prefix, struct timespec *ts)
+static void print_header_interval_std(struct perf_stat_config *config,
+				      struct target *_target __maybe_unused,
+				      struct evlist *evlist,
+				      int argc __maybe_unused,
+				      const char **argv __maybe_unused)
 {
-	bool metric_only = config->metric_only;
-	unsigned int unit_width = config->unit_width;
 	FILE *output = config->output;
-	static int num_print_interval;
 
-	if (config->interval_clear)
-		puts(CONSOLE_CLEAR);
-
-	if (!config->iostat_run && !config->json_output)
-		sprintf(prefix, "%6lu.%09lu%s", (unsigned long) ts->tv_sec,
-				 ts->tv_nsec, config->csv_sep);
-	if (!config->iostat_run && config->json_output && !config->metric_only)
-		sprintf(prefix, "{\"interval\" : %lu.%09lu, ", (unsigned long)
-				 ts->tv_sec, ts->tv_nsec);
-	if (!config->iostat_run && config->json_output && config->metric_only)
-		sprintf(prefix, "{\"interval\" : %lu.%09lu}", (unsigned long)
-				 ts->tv_sec, ts->tv_nsec);
-
-	if ((num_print_interval == 0 && !config->csv_output && !config->json_output)
-			 || config->interval_clear) {
-		switch (config->aggr_mode) {
-		case AGGR_NODE:
-			fprintf(output, "#           time node   cpus");
-			if (!metric_only)
-				fprintf(output, "             counts %*s events\n", unit_width, "unit");
-			break;
-		case AGGR_SOCKET:
-			fprintf(output, "#           time socket cpus");
-			if (!metric_only)
-				fprintf(output, "             counts %*s events\n", unit_width, "unit");
-			break;
-		case AGGR_DIE:
-			fprintf(output, "#           time die          cpus");
-			if (!metric_only)
-				fprintf(output, "             counts %*s events\n", unit_width, "unit");
-			break;
-		case AGGR_CORE:
-			fprintf(output, "#           time core            cpus");
-			if (!metric_only)
-				fprintf(output, "             counts %*s events\n", unit_width, "unit");
-			break;
-		case AGGR_NONE:
-			fprintf(output, "#           time CPU    ");
-			if (!metric_only)
-				fprintf(output, "                counts %*s events\n", unit_width, "unit");
-			break;
-		case AGGR_THREAD:
-			fprintf(output, "#           time             comm-pid");
-			if (!metric_only)
-				fprintf(output, "                  counts %*s events\n", unit_width, "unit");
-			break;
-		case AGGR_GLOBAL:
-		default:
-			if (!config->iostat_run) {
-				fprintf(output, "#           time");
-				if (!metric_only)
-					fprintf(output, "             counts %*s events\n", unit_width, "unit");
-			}
-		case AGGR_UNSET:
-		case AGGR_MAX:
-			break;
-		}
+	switch (config->aggr_mode) {
+	case AGGR_NODE:
+	case AGGR_SOCKET:
+	case AGGR_DIE:
+	case AGGR_CORE:
+		fprintf(output, "#%*s %-*s cpus",
+			INTERVAL_LEN - 1, "time",
+			aggr_header_lens[config->aggr_mode],
+			aggr_header_std[config->aggr_mode]);
+		break;
+	case AGGR_NONE:
+		fprintf(output, "#%*s %-*s",
+			INTERVAL_LEN - 1, "time",
+			aggr_header_lens[config->aggr_mode],
+			aggr_header_std[config->aggr_mode]);
+		break;
+	case AGGR_THREAD:
+		fprintf(output, "#%*s %*s-%-*s",
+			INTERVAL_LEN - 1, "time",
+			COMM_LEN, "comm", PID_LEN, "pid");
+		break;
+	case AGGR_GLOBAL:
+	default:
+		if (!config->iostat_run)
+			fprintf(output, "#%*s",
+				INTERVAL_LEN - 1, "time");
+	case AGGR_UNSET:
+	case AGGR_MAX:
+		break;
 	}
 
-	if ((num_print_interval == 0 || config->interval_clear)
-			 && metric_only && !config->json_output)
-		print_metric_headers(config, evlist, " ", true);
-	if ((num_print_interval == 0 || config->interval_clear)
-			 && metric_only && config->json_output) {
-		fprintf(output, "{");
-		print_metric_headers(config, evlist, " ", true);
-	}
-	if (++num_print_interval == 25)
-		num_print_interval = 0;
+	if (config->metric_only)
+		print_metric_headers(config, evlist, true);
+	else
+		fprintf(output, " %*s %*s events\n",
+			COUNTS_LEN, "counts", config->unit_width, "unit");
+}
+
+static void print_header_std(struct perf_stat_config *config,
+			     struct target *_target, struct evlist *evlist,
+			     int argc, const char **argv)
+{
+	FILE *output = config->output;
+	int i;
+
+	fprintf(output, "\n");
+	fprintf(output, " Performance counter stats for ");
+	if (_target->bpf_str)
+		fprintf(output, "\'BPF program(s) %s", _target->bpf_str);
+	else if (_target->system_wide)
+		fprintf(output, "\'system wide");
+	else if (_target->cpu_list)
+		fprintf(output, "\'CPU(s) %s", _target->cpu_list);
+	else if (!target__has_task(_target)) {
+		fprintf(output, "\'%s", argv ? argv[0] : "pipe");
+		for (i = 1; argv && (i < argc); i++)
+			fprintf(output, " %s", argv[i]);
+	} else if (_target->pid)
+		fprintf(output, "process id \'%s", _target->pid);
+	else
+		fprintf(output, "thread id \'%s", _target->tid);
+
+	fprintf(output, "\'");
+	if (config->run_count > 1)
+		fprintf(output, " (%d runs)", config->run_count);
+	fprintf(output, ":\n\n");
+
+	if (config->metric_only)
+		print_metric_headers(config, evlist, false);
+}
+
+static void print_header_csv(struct perf_stat_config *config,
+			     struct target *_target __maybe_unused,
+			     struct evlist *evlist,
+			     int argc __maybe_unused,
+			     const char **argv __maybe_unused)
+{
+	if (config->metric_only)
+		print_metric_headers(config, evlist, true);
+}
+static void print_header_json(struct perf_stat_config *config,
+			      struct target *_target __maybe_unused,
+			      struct evlist *evlist,
+			      int argc __maybe_unused,
+			      const char **argv __maybe_unused)
+{
+	if (config->metric_only)
+		print_metric_headers(config, evlist, true);
 }
 
 static void print_header(struct perf_stat_config *config,
 			 struct target *_target,
+			 struct evlist *evlist,
 			 int argc, const char **argv)
 {
-	FILE *output = config->output;
-	int i;
+	static int num_print_iv;
 
 	fflush(stdout);
 
-	if (!config->csv_output && !config->json_output) {
-		fprintf(output, "\n");
-		fprintf(output, " Performance counter stats for ");
-		if (_target->bpf_str)
-			fprintf(output, "\'BPF program(s) %s", _target->bpf_str);
-		else if (_target->system_wide)
-			fprintf(output, "\'system wide");
-		else if (_target->cpu_list)
-			fprintf(output, "\'CPU(s) %s", _target->cpu_list);
-		else if (!target__has_task(_target)) {
-			fprintf(output, "\'%s", argv ? argv[0] : "pipe");
-			for (i = 1; argv && (i < argc); i++)
-				fprintf(output, " %s", argv[i]);
-		} else if (_target->pid)
-			fprintf(output, "process id \'%s", _target->pid);
-		else
-			fprintf(output, "thread id \'%s", _target->tid);
+	if (config->interval_clear)
+		puts(CONSOLE_CLEAR);
 
-		fprintf(output, "\'");
-		if (config->run_count > 1)
-			fprintf(output, " (%d runs)", config->run_count);
-		fprintf(output, ":\n\n");
+	if (num_print_iv == 0 || config->interval_clear) {
+		if (config->json_output)
+			print_header_json(config, _target, evlist, argc, argv);
+		else if (config->csv_output)
+			print_header_csv(config, _target, evlist, argc, argv);
+		else if (config->interval)
+			print_header_interval_std(config, _target, evlist, argc, argv);
+		else
+			print_header_std(config, _target, evlist, argc, argv);
 	}
+
+	if (num_print_iv++ == 25)
+		num_print_iv = 0;
 }
 
 static int get_precision(double num)
@@ -1348,6 +1244,9 @@ static void print_footer(struct perf_stat_config *config)
 	double avg = avg_stats(config->walltime_nsecs_stats) / NSEC_PER_SEC;
 	FILE *output = config->output;
 
+	if (config->interval || config->csv_output || config->json_output)
+		return;
+
 	if (!config->null_run)
 		fprintf(output, "\n");
 
@@ -1376,7 +1275,7 @@ static void print_footer(struct perf_stat_config *config)
 		fprintf(output, " %17.*f +- %.*f seconds time elapsed",
 			precision, avg, precision, sd);
 
-		print_noise_pct(config, sd, avg);
+		print_noise_pct(config, sd, avg, /*before_metric=*/false);
 	}
 	fprintf(output, "\n\n");
 
@@ -1393,121 +1292,127 @@ static void print_footer(struct perf_stat_config *config)
 			"the same PMU. Try reorganizing the group.\n");
 }
 
-static void print_percore_thread(struct perf_stat_config *config,
-				 struct evsel *counter, char *prefix)
-{
-	int s;
-	struct aggr_cpu_id s2, id;
-	struct perf_cpu_map *cpus;
-	bool first = true;
-	int idx;
-	struct perf_cpu cpu;
-
-	cpus = evsel__cpus(counter);
-	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
-		s2 = config->aggr_get_id(config, cpu);
-		for (s = 0; s < config->aggr_map->nr; s++) {
-			id = config->aggr_map->map[s];
-			if (aggr_cpu_id__equal(&s2, &id))
-				break;
-		}
-
-		print_counter_aggrdata(config, counter, s,
-				       prefix, false,
-				       &first, cpu);
-	}
-}
-
 static void print_percore(struct perf_stat_config *config,
-			  struct evsel *counter, char *prefix)
+			  struct evsel *counter, struct outstate *os)
 {
 	bool metric_only = config->metric_only;
 	FILE *output = config->output;
-	int s;
-	bool first = true;
+	struct cpu_aggr_map *core_map;
+	int s, c, i;
 
 	if (!config->aggr_map || !config->aggr_get_id)
 		return;
 
 	if (config->percore_show_thread)
-		return print_percore_thread(config, counter, prefix);
+		return print_counter(config, counter, os);
 
-	for (s = 0; s < config->aggr_map->nr; s++) {
-		if (prefix && metric_only)
-			fprintf(output, "%s", prefix);
+	core_map = cpu_aggr_map__empty_new(config->aggr_map->nr);
+	if (core_map == NULL) {
+		fprintf(output, "Cannot allocate per-core aggr map for display\n");
+		return;
+	}
+
+	for (s = 0, c = 0; s < config->aggr_map->nr; s++) {
+		struct perf_cpu curr_cpu = config->aggr_map->map[s].cpu;
+		struct aggr_cpu_id core_id = aggr_cpu_id__core(curr_cpu, NULL);
+		bool found = false;
 
-		print_counter_aggrdata(config, counter, s,
-				prefix, metric_only,
-				&first, (struct perf_cpu){ .cpu = -1 });
+		for (i = 0; i < c; i++) {
+			if (aggr_cpu_id__equal(&core_map->map[i], &core_id)) {
+				found = true;
+				break;
+			}
+		}
+		if (found)
+			continue;
+
+		print_counter_aggrdata(config, counter, s, os);
+
+		core_map->map[c++] = core_id;
 	}
+	free(core_map);
 
 	if (metric_only)
 		fputc('\n', output);
 }
 
+static void print_cgroup_counter(struct perf_stat_config *config, struct evlist *evlist,
+				 struct outstate *os)
+{
+	struct evsel *counter;
+
+	evlist__for_each_entry(evlist, counter) {
+		if (os->cgrp != counter->cgrp) {
+			if (os->cgrp != NULL)
+				print_metric_end(config, os);
+
+			os->cgrp = counter->cgrp;
+			print_metric_begin(config, evlist, os, /*aggr_idx=*/0);
+		}
+
+		print_counter(config, counter, os);
+	}
+	if (os->cgrp)
+		print_metric_end(config, os);
+}
+
 void evlist__print_counters(struct evlist *evlist, struct perf_stat_config *config,
-			    struct target *_target, struct timespec *ts, int argc, const char **argv)
+			    struct target *_target, struct timespec *ts,
+			    int argc, const char **argv)
 {
 	bool metric_only = config->metric_only;
 	int interval = config->interval;
 	struct evsel *counter;
-	char buf[64], *prefix = NULL;
+	char buf[64];
+	struct outstate os = {
+		.fh = config->output,
+		.first = true,
+	};
 
 	if (config->iostat_run)
 		evlist->selected = evlist__first(evlist);
 
-	if (interval)
-		print_interval(config, evlist, prefix = buf, ts);
-	else
-		print_header(config, _target, argc, argv);
-
-	if (metric_only) {
-		static int num_print_iv;
-
-		if (num_print_iv == 0 && !interval)
-			print_metric_headers(config, evlist, prefix, false);
-		if (num_print_iv++ == 25)
-			num_print_iv = 0;
-		if (config->aggr_mode == AGGR_GLOBAL && prefix && !config->iostat_run)
-			fprintf(config->output, "%s", prefix);
-
-		if (config->json_output && !config->metric_only)
-			fprintf(config->output, "}");
+	if (interval) {
+		os.prefix = buf;
+		prepare_interval(config, buf, sizeof(buf), ts);
 	}
 
+	print_header(config, _target, evlist, argc, argv);
+
 	switch (config->aggr_mode) {
 	case AGGR_CORE:
 	case AGGR_DIE:
 	case AGGR_SOCKET:
 	case AGGR_NODE:
-		print_aggr(config, evlist, prefix);
+		if (config->cgroup_list)
+			print_aggr_cgroup(config, evlist, &os);
+		else
+			print_aggr(config, evlist, &os);
 		break;
 	case AGGR_THREAD:
-		evlist__for_each_entry(evlist, counter) {
-			print_aggr_thread(config, _target, counter, prefix);
-		}
-		break;
 	case AGGR_GLOBAL:
-		if (config->iostat_run)
-			iostat_print_counters(evlist, config, ts, prefix = buf,
-					      print_counter_aggr);
-		else {
+		if (config->iostat_run) {
+			iostat_print_counters(evlist, config, ts, buf,
+					      (iostat_print_counter_t)print_counter, &os);
+		} else if (config->cgroup_list) {
+			print_cgroup_counter(config, evlist, &os);
+		} else {
+			print_metric_begin(config, evlist, &os, /*aggr_idx=*/0);
 			evlist__for_each_entry(evlist, counter) {
-				print_counter_aggr(config, counter, prefix);
+				print_counter(config, counter, &os);
 			}
-			if (metric_only)
-				fputc('\n', config->output);
+			print_metric_end(config, &os);
 		}
 		break;
 	case AGGR_NONE:
 		if (metric_only)
-			print_no_aggr_metric(config, evlist, prefix);
+			print_no_aggr_metric(config, evlist, &os);
 		else {
 			evlist__for_each_entry(evlist, counter) {
 				if (counter->percore)
-					print_percore(config, counter, prefix);
+					print_percore(config, counter, &os);
 				else
-					print_counter(config, counter, prefix);
+					print_counter(config, counter, &os);
 			}
 		}
 		break;
@@ -1517,8 +1422,7 @@ void evlist__print_counters(struct evlist *evlist, struct perf_stat_config *conf
 		break;
 	}
 
-	if (!interval && !config->csv_output && !config->json_output)
-		print_footer(config);
+	print_footer(config);
 
 	fflush(config->output);
 }
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 07b29fe272c7..cadb2df23c87 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -14,6 +14,7 @@
 #include "units.h"
 #include <linux/zalloc.h>
 #include "iostat.h"
+#include "util/hashmap.h"
 
 /*
  * AGGR_GLOBAL: Use CPU 0
@@ -398,7 +399,7 @@ void perf_stat__collect_metric_expr(struct evlist *evsel_list)
 
 		i = 0;
 		hashmap__for_each_entry(ctx->ids, cur, bkt) {
-			const char *metric_name = (const char *)cur->key;
+			const char *metric_name = cur->pkey;
 
 			found = false;
 			if (leader) {
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 8ec8bb4a9912..534d36d26fc3 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -14,11 +14,7 @@
 #include "evlist.h"
 #include "evsel.h"
 #include "thread_map.h"
-#ifdef HAVE_LIBBPF_SUPPORT
-#include <bpf/hashmap.h>
-#else
 #include "util/hashmap.h"
-#endif
 #include <linux/zalloc.h>
 
 void update_stats(struct stats *stats, u64 val)
@@ -130,18 +126,65 @@ static void perf_stat_evsel_id_init(struct evsel *evsel)
 	}
 }
 
+static void evsel__reset_aggr_stats(struct evsel *evsel)
+{
+	struct perf_stat_evsel *ps = evsel->stats;
+	struct perf_stat_aggr *aggr = ps->aggr;
+
+	if (aggr)
+		memset(aggr, 0, sizeof(*aggr) * ps->nr_aggr);
+}
+
 static void evsel__reset_stat_priv(struct evsel *evsel)
 {
 	struct perf_stat_evsel *ps = evsel->stats;
 
 	init_stats(&ps->res_stats);
+	evsel__reset_aggr_stats(evsel);
+}
+
+static int evsel__alloc_aggr_stats(struct evsel *evsel, int nr_aggr)
+{
+	struct perf_stat_evsel *ps = evsel->stats;
+
+	if (ps == NULL)
+		return 0;
+
+	ps->nr_aggr = nr_aggr;
+	ps->aggr = calloc(nr_aggr, sizeof(*ps->aggr));
+	if (ps->aggr == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+int evlist__alloc_aggr_stats(struct evlist *evlist, int nr_aggr)
+{
+	struct evsel *evsel;
+
+	evlist__for_each_entry(evlist, evsel) {
+		if (evsel__alloc_aggr_stats(evsel, nr_aggr) < 0)
+			return -1;
+	}
+	return 0;
 }
 
-static int evsel__alloc_stat_priv(struct evsel *evsel)
+static int evsel__alloc_stat_priv(struct evsel *evsel, int nr_aggr)
 {
-	evsel->stats = zalloc(sizeof(struct perf_stat_evsel));
-	if (evsel->stats == NULL)
+	struct perf_stat_evsel *ps;
+
+	ps = zalloc(sizeof(*ps));
+	if (ps == NULL)
 		return -ENOMEM;
+
+	evsel->stats = ps;
+
+	if (nr_aggr && evsel__alloc_aggr_stats(evsel, nr_aggr) < 0) {
+		evsel->stats = NULL;
+		free(ps);
+		return -ENOMEM;
+	}
+
 	perf_stat_evsel_id_init(evsel);
 	evsel__reset_stat_priv(evsel);
 	return 0;
@@ -151,8 +194,10 @@ static void evsel__free_stat_priv(struct evsel *evsel)
 {
 	struct perf_stat_evsel *ps = evsel->stats;
 
-	if (ps)
+	if (ps) {
+		zfree(&ps->aggr);
 		zfree(&ps->group_data);
+	}
 	zfree(&evsel->stats);
 }
 
@@ -181,9 +226,9 @@ static void evsel__reset_prev_raw_counts(struct evsel *evsel)
 		perf_counts__reset(evsel->prev_raw_counts);
 }
 
-static int evsel__alloc_stats(struct evsel *evsel, bool alloc_raw)
+static int evsel__alloc_stats(struct evsel *evsel, int nr_aggr, bool alloc_raw)
 {
-	if (evsel__alloc_stat_priv(evsel) < 0 ||
+	if (evsel__alloc_stat_priv(evsel, nr_aggr) < 0 ||
 	    evsel__alloc_counts(evsel) < 0 ||
 	    (alloc_raw && evsel__alloc_prev_raw_counts(evsel) < 0))
 		return -ENOMEM;
@@ -191,12 +236,17 @@ static int evsel__alloc_stats(struct evsel *evsel, bool alloc_raw)
 	return 0;
 }
 
-int evlist__alloc_stats(struct evlist *evlist, bool alloc_raw)
+int evlist__alloc_stats(struct perf_stat_config *config,
+			struct evlist *evlist, bool alloc_raw)
 {
 	struct evsel *evsel;
+	int nr_aggr = 0;
+
+	if (config && config->aggr_map)
+		nr_aggr = config->aggr_map->nr;
 
 	evlist__for_each_entry(evlist, evsel) {
-		if (evsel__alloc_stats(evsel, alloc_raw))
+		if (evsel__alloc_stats(evsel, nr_aggr, alloc_raw))
 			goto out_free;
 	}
 
@@ -228,6 +278,14 @@ void evlist__reset_stats(struct evlist *evlist)
 	}
 }
 
+void evlist__reset_aggr_stats(struct evlist *evlist)
+{
+	struct evsel *evsel;
+
+	evlist__for_each_entry(evlist, evsel)
+		evsel__reset_aggr_stats(evsel);
+}
+
 void evlist__reset_prev_raw_counts(struct evlist *evlist)
 {
 	struct evsel *evsel;
@@ -246,8 +304,6 @@ static void evsel__copy_prev_raw_counts(struct evsel *evsel)
 				*perf_counts(evsel->prev_raw_counts, idx, thread);
 		}
 	}
-
-	evsel->counts->aggr = evsel->prev_raw_counts->aggr;
 }
 
 void evlist__copy_prev_raw_counts(struct evlist *evlist)
@@ -258,35 +314,14 @@ void evlist__copy_prev_raw_counts(struct evlist *evlist)
 		evsel__copy_prev_raw_counts(evsel);
 }
 
-void evlist__save_aggr_prev_raw_counts(struct evlist *evlist)
-{
-	struct evsel *evsel;
-
-	/*
-	 * To collect the overall statistics for interval mode,
-	 * we copy the counts from evsel->prev_raw_counts to
-	 * evsel->counts. The perf_stat_process_counter creates
-	 * aggr values from per cpu values, but the per cpu values
-	 * are 0 for AGGR_GLOBAL. So we use a trick that saves the
-	 * previous aggr value to the first member of perf_counts,
-	 * then aggr calculation in process_counter_values can work
-	 * correctly.
-	 */
-	evlist__for_each_entry(evlist, evsel) {
-		*perf_counts(evsel->prev_raw_counts, 0, 0) =
-			evsel->prev_raw_counts->aggr;
-	}
-}
-
-static size_t pkg_id_hash(const void *__key, void *ctx __maybe_unused)
+static size_t pkg_id_hash(long __key, void *ctx __maybe_unused)
 {
 	uint64_t *key = (uint64_t *) __key;
 
 	return *key & 0xffffffff;
 }
 
-static bool pkg_id_equal(const void *__key1, const void *__key2,
-			 void *ctx __maybe_unused)
+static bool pkg_id_equal(long __key1, long __key2, void *ctx __maybe_unused)
 {
 	uint64_t *key1 = (uint64_t *) __key1;
 	uint64_t *key2 = (uint64_t *) __key2;
@@ -347,21 +382,40 @@ static int check_per_pkg(struct evsel *counter, struct perf_counts_values *vals,
 		return -ENOMEM;
 
 	*key = (uint64_t)d << 32 | s;
-	if (hashmap__find(mask, (void *)key, NULL)) {
+	if (hashmap__find(mask, key, NULL)) {
 		*skip = true;
 		free(key);
 	} else
-		ret = hashmap__add(mask, (void *)key, (void *)1);
+		ret = hashmap__add(mask, key, 1);
 
 	return ret;
 }
 
+static bool evsel__count_has_error(struct evsel *evsel,
+				   struct perf_counts_values *count,
+				   struct perf_stat_config *config)
+{
+	/* the evsel was failed already */
+	if (evsel->err || evsel->counts->scaled == -1)
+		return true;
+
+	/* this is meaningful for CPU aggregation modes only */
+	if (config->aggr_mode == AGGR_GLOBAL)
+		return false;
+
+	/* it's considered ok when it actually ran */
+	if (count->ena != 0 && count->run != 0)
+		return false;
+
+	return true;
+}
+
 static int
 process_counter_values(struct perf_stat_config *config, struct evsel *evsel,
 		       int cpu_map_idx, int thread,
 		       struct perf_counts_values *count)
 {
-	struct perf_counts_values *aggr = &evsel->counts->aggr;
+	struct perf_stat_evsel *ps = evsel->stats;
 	static struct perf_counts_values zero;
 	bool skip = false;
 
@@ -373,34 +427,60 @@ process_counter_values(struct perf_stat_config *config, struct evsel *evsel,
 	if (skip)
 		count = &zero;
 
-	switch (config->aggr_mode) {
-	case AGGR_THREAD:
-	case AGGR_CORE:
-	case AGGR_DIE:
-	case AGGR_SOCKET:
-	case AGGR_NODE:
-	case AGGR_NONE:
-		if (!evsel->snapshot)
-			evsel__compute_deltas(evsel, cpu_map_idx, thread, count);
-		perf_counts_values__scale(count, config->scale, NULL);
-		if ((config->aggr_mode == AGGR_NONE) && (!evsel->percore)) {
-			perf_stat__update_shadow_stats(evsel, count->val,
-						       cpu_map_idx, &rt_stat);
-		}
+	if (!evsel->snapshot)
+		evsel__compute_deltas(evsel, cpu_map_idx, thread, count);
+	perf_counts_values__scale(count, config->scale, NULL);
+
+	if (config->aggr_mode == AGGR_THREAD) {
+		struct perf_counts_values *aggr_counts = &ps->aggr[thread].counts;
+
+		/*
+		 * Skip value 0 when enabling --per-thread globally,
+		 * otherwise too many 0 output.
+		 */
+		if (count->val == 0 && config->system_wide)
+			return 0;
+
+		ps->aggr[thread].nr++;
+
+		aggr_counts->val += count->val;
+		aggr_counts->ena += count->ena;
+		aggr_counts->run += count->run;
+		return 0;
+	}
 
-		if (config->aggr_mode == AGGR_THREAD) {
-			perf_stat__update_shadow_stats(evsel, count->val,
-						       thread, &rt_stat);
+	if (ps->aggr) {
+		struct perf_cpu cpu = perf_cpu_map__cpu(evsel->core.cpus, cpu_map_idx);
+		struct aggr_cpu_id aggr_id = config->aggr_get_id(config, cpu);
+		struct perf_stat_aggr *ps_aggr;
+		int i;
+
+		for (i = 0; i < ps->nr_aggr; i++) {
+			if (!aggr_cpu_id__equal(&aggr_id, &config->aggr_map->map[i]))
+				continue;
+
+			ps_aggr = &ps->aggr[i];
+			ps_aggr->nr++;
+
+			/*
+			 * When any result is bad, make them all to give consistent output
+			 * in interval mode.  But per-task counters can have 0 enabled time
+			 * when some tasks are idle.
+			 */
+			if (evsel__count_has_error(evsel, count, config) && !ps_aggr->failed) {
+				ps_aggr->counts.val = 0;
+				ps_aggr->counts.ena = 0;
+				ps_aggr->counts.run = 0;
+				ps_aggr->failed = true;
+			}
+
+			if (!ps_aggr->failed) {
+				ps_aggr->counts.val += count->val;
+				ps_aggr->counts.ena += count->ena;
+				ps_aggr->counts.run += count->run;
+			}
+			break;
 		}
-		break;
-	case AGGR_GLOBAL:
-		aggr->val += count->val;
-		aggr->ena += count->ena;
-		aggr->run += count->run;
-	case AGGR_UNSET:
-	case AGGR_MAX:
-	default:
-		break;
 	}
 
 	return 0;
@@ -427,13 +507,10 @@ static int process_counter_maps(struct perf_stat_config *config,
 int perf_stat_process_counter(struct perf_stat_config *config,
 			      struct evsel *counter)
 {
-	struct perf_counts_values *aggr = &counter->counts->aggr;
 	struct perf_stat_evsel *ps = counter->stats;
-	u64 *count = counter->counts->aggr.values;
+	u64 *count;
 	int ret;
 
-	aggr->val = aggr->ena = aggr->run = 0;
-
 	if (counter->per_pkg)
 		evsel__zero_per_pkg(counter);
 
@@ -444,10 +521,11 @@ int perf_stat_process_counter(struct perf_stat_config *config,
 	if (config->aggr_mode != AGGR_GLOBAL)
 		return 0;
 
-	if (!counter->snapshot)
-		evsel__compute_deltas(counter, -1, -1, aggr);
-	perf_counts_values__scale(aggr, config->scale, &counter->counts->scaled);
-
+	/*
+	 * GLOBAL aggregation mode only has a single aggr counts,
+	 * so we can use ps->aggr[0] as the actual output.
+	 */
+	count = ps->aggr[0].counts.values;
 	update_stats(&ps->res_stats, *count);
 
 	if (verbose > 0) {
@@ -455,13 +533,194 @@ int perf_stat_process_counter(struct perf_stat_config *config,
 			evsel__name(counter), count[0], count[1], count[2]);
 	}
 
-	/*
-	 * Save the full runtime - to allow normalization during printout:
-	 */
-	perf_stat__update_shadow_stats(counter, *count, 0, &rt_stat);
+	return 0;
+}
+
+static int evsel__merge_aggr_counters(struct evsel *evsel, struct evsel *alias)
+{
+	struct perf_stat_evsel *ps_a = evsel->stats;
+	struct perf_stat_evsel *ps_b = alias->stats;
+	int i;
+
+	if (ps_a->aggr == NULL && ps_b->aggr == NULL)
+		return 0;
+
+	if (ps_a->nr_aggr != ps_b->nr_aggr) {
+		pr_err("Unmatched aggregation mode between aliases\n");
+		return -1;
+	}
+
+	for (i = 0; i < ps_a->nr_aggr; i++) {
+		struct perf_counts_values *aggr_counts_a = &ps_a->aggr[i].counts;
+		struct perf_counts_values *aggr_counts_b = &ps_b->aggr[i].counts;
+
+		/* NB: don't increase aggr.nr for aliases */
+
+		aggr_counts_a->val += aggr_counts_b->val;
+		aggr_counts_a->ena += aggr_counts_b->ena;
+		aggr_counts_a->run += aggr_counts_b->run;
+	}
 
 	return 0;
 }
+/* events should have the same name, scale, unit, cgroup but on different PMUs */
+static bool evsel__is_alias(struct evsel *evsel_a, struct evsel *evsel_b)
+{
+	if (strcmp(evsel__name(evsel_a), evsel__name(evsel_b)))
+		return false;
+
+	if (evsel_a->scale != evsel_b->scale)
+		return false;
+
+	if (evsel_a->cgrp != evsel_b->cgrp)
+		return false;
+
+	if (strcmp(evsel_a->unit, evsel_b->unit))
+		return false;
+
+	if (evsel__is_clock(evsel_a) != evsel__is_clock(evsel_b))
+		return false;
+
+	return !!strcmp(evsel_a->pmu_name, evsel_b->pmu_name);
+}
+
+static void evsel__merge_aliases(struct evsel *evsel)
+{
+	struct evlist *evlist = evsel->evlist;
+	struct evsel *alias;
+
+	alias = list_prepare_entry(evsel, &(evlist->core.entries), core.node);
+	list_for_each_entry_continue(alias, &evlist->core.entries, core.node) {
+		/* Merge the same events on different PMUs. */
+		if (evsel__is_alias(evsel, alias)) {
+			evsel__merge_aggr_counters(evsel, alias);
+			alias->merged_stat = true;
+		}
+	}
+}
+
+static bool evsel__should_merge_hybrid(const struct evsel *evsel,
+				       const struct perf_stat_config *config)
+{
+	return config->hybrid_merge && evsel__is_hybrid(evsel);
+}
+
+static void evsel__merge_stats(struct evsel *evsel, struct perf_stat_config *config)
+{
+	/* this evsel is already merged */
+	if (evsel->merged_stat)
+		return;
+
+	if (evsel->auto_merge_stats || evsel__should_merge_hybrid(evsel, config))
+		evsel__merge_aliases(evsel);
+}
+
+/* merge the same uncore and hybrid events if requested */
+void perf_stat_merge_counters(struct perf_stat_config *config, struct evlist *evlist)
+{
+	struct evsel *evsel;
+
+	if (config->no_merge)
+		return;
+
+	evlist__for_each_entry(evlist, evsel)
+		evsel__merge_stats(evsel, config);
+}
+
+static void evsel__update_percore_stats(struct evsel *evsel, struct aggr_cpu_id *core_id)
+{
+	struct perf_stat_evsel *ps = evsel->stats;
+	struct perf_counts_values counts = { 0, };
+	struct aggr_cpu_id id;
+	struct perf_cpu cpu;
+	int idx;
+
+	/* collect per-core counts */
+	perf_cpu_map__for_each_cpu(cpu, idx, evsel->core.cpus) {
+		struct perf_stat_aggr *aggr = &ps->aggr[idx];
+
+		id = aggr_cpu_id__core(cpu, NULL);
+		if (!aggr_cpu_id__equal(core_id, &id))
+			continue;
+
+		counts.val += aggr->counts.val;
+		counts.ena += aggr->counts.ena;
+		counts.run += aggr->counts.run;
+	}
+
+	/* update aggregated per-core counts for each CPU */
+	perf_cpu_map__for_each_cpu(cpu, idx, evsel->core.cpus) {
+		struct perf_stat_aggr *aggr = &ps->aggr[idx];
+
+		id = aggr_cpu_id__core(cpu, NULL);
+		if (!aggr_cpu_id__equal(core_id, &id))
+			continue;
+
+		aggr->counts.val = counts.val;
+		aggr->counts.ena = counts.ena;
+		aggr->counts.run = counts.run;
+
+		aggr->used = true;
+	}
+}
+
+/* we have an aggr_map for cpu, but want to aggregate the counters per-core */
+static void evsel__process_percore(struct evsel *evsel)
+{
+	struct perf_stat_evsel *ps = evsel->stats;
+	struct aggr_cpu_id core_id;
+	struct perf_cpu cpu;
+	int idx;
+
+	if (!evsel->percore)
+		return;
+
+	perf_cpu_map__for_each_cpu(cpu, idx, evsel->core.cpus) {
+		struct perf_stat_aggr *aggr = &ps->aggr[idx];
+
+		if (aggr->used)
+			continue;
+
+		core_id = aggr_cpu_id__core(cpu, NULL);
+		evsel__update_percore_stats(evsel, &core_id);
+	}
+}
+
+/* process cpu stats on per-core events */
+void perf_stat_process_percore(struct perf_stat_config *config, struct evlist *evlist)
+{
+	struct evsel *evsel;
+
+	if (config->aggr_mode != AGGR_NONE)
+		return;
+
+	evlist__for_each_entry(evlist, evsel)
+		evsel__process_percore(evsel);
+}
+
+static void evsel__update_shadow_stats(struct evsel *evsel)
+{
+	struct perf_stat_evsel *ps = evsel->stats;
+	int i;
+
+	if (ps->aggr == NULL)
+		return;
+
+	for (i = 0; i < ps->nr_aggr; i++) {
+		struct perf_counts_values *aggr_counts = &ps->aggr[i].counts;
+
+		perf_stat__update_shadow_stats(evsel, aggr_counts->val, i, &rt_stat);
+	}
+}
+
+void perf_stat_process_shadow_stats(struct perf_stat_config *config __maybe_unused,
+				    struct evlist *evlist)
+{
+	struct evsel *evsel;
+
+	evlist__for_each_entry(evlist, evsel)
+		evsel__update_shadow_stats(evsel);
+}
 
 int perf_event__process_stat_event(struct perf_session *session,
 				   union perf_event *event)
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index b0899c6e002f..499c3bf81333 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -8,6 +8,7 @@
 #include <sys/resource.h>
 #include "cpumap.h"
 #include "rblist.h"
+#include "counts.h"
 
 struct perf_cpu_map;
 struct perf_stat_config;
@@ -42,9 +43,29 @@ enum perf_stat_evsel_id {
 	PERF_STAT_EVSEL_ID__MAX,
 };
 
+/* hold aggregated event info */
+struct perf_stat_aggr {
+	/* aggregated values */
+	struct perf_counts_values	counts;
+	/* number of entries (CPUs) aggregated */
+	int				nr;
+	/* whether any entry has failed to read/process event */
+	bool				failed;
+	/* to mark this data is processed already */
+	bool				used;
+};
+
+/* per-evsel event stats */
 struct perf_stat_evsel {
+	/* used for repeated runs */
 	struct stats		 res_stats;
+	/* evsel id for quick check */
 	enum perf_stat_evsel_id	 id;
+	/* number of allocated 'aggr' */
+	int			 nr_aggr;
+	/* aggregated event values */
+	struct perf_stat_aggr	*aggr;
+	/* used for group read */
 	u64			*group_data;
 };
 
@@ -139,7 +160,6 @@ struct perf_stat_config {
 	bool			 metric_no_group;
 	bool			 metric_no_merge;
 	bool			 stop_read_counter;
-	bool			 quiet;
 	bool			 iostat_run;
 	char			 *user_requested_cpu_list;
 	bool			 system_wide;
@@ -203,15 +223,6 @@ static inline void update_rusage_stats(struct rusage_stats *ru_stats, struct rus
 struct evsel;
 struct evlist;
 
-struct perf_aggr_thread_value {
-	struct evsel *counter;
-	struct aggr_cpu_id id;
-	double uval;
-	u64 val;
-	u64 run;
-	u64 ena;
-};
-
 bool __perf_stat_evsel__is(struct evsel *evsel, enum perf_stat_evsel_id id);
 
 #define perf_stat_evsel__is(evsel, id) \
@@ -248,15 +259,23 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 				   struct runtime_stat *st);
 void perf_stat__collect_metric_expr(struct evlist *);
 
-int evlist__alloc_stats(struct evlist *evlist, bool alloc_raw);
+int evlist__alloc_stats(struct perf_stat_config *config,
+			struct evlist *evlist, bool alloc_raw);
 void evlist__free_stats(struct evlist *evlist);
 void evlist__reset_stats(struct evlist *evlist);
 void evlist__reset_prev_raw_counts(struct evlist *evlist);
 void evlist__copy_prev_raw_counts(struct evlist *evlist);
 void evlist__save_aggr_prev_raw_counts(struct evlist *evlist);
 
+int evlist__alloc_aggr_stats(struct evlist *evlist, int nr_aggr);
+void evlist__reset_aggr_stats(struct evlist *evlist);
+
 int perf_stat_process_counter(struct perf_stat_config *config,
 			      struct evsel *counter);
+void perf_stat_merge_counters(struct perf_stat_config *config, struct evlist *evlist);
+void perf_stat_process_percore(struct perf_stat_config *config, struct evlist *evlist);
+void perf_stat_process_shadow_stats(struct perf_stat_config *config, struct evlist *evlist);
+
 struct perf_tool;
 union perf_event;
 struct perf_session;
diff --git a/tools/perf/util/svghelper.c b/tools/perf/util/svghelper.c
index 1e0c731fc539..5c62d3118c41 100644
--- a/tools/perf/util/svghelper.c
+++ b/tools/perf/util/svghelper.c
@@ -741,7 +741,7 @@ static int str_to_bitmap(char *s, cpumask_t *b, int nr_cpus)
 			break;
 		}
 
-		set_bit(c.cpu, cpumask_bits(b));
+		__set_bit(c.cpu, cpumask_bits(b));
 	}
 
 	perf_cpu_map__put(m);
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index 647b7dff8ef3..80345695b136 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -1303,7 +1303,7 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss,
 			   (!used_opd && syms_ss->adjust_symbols)) {
 			GElf_Phdr phdr;
 
-			if (elf_read_program_header(syms_ss->elf,
+			if (elf_read_program_header(runtime_ss->elf,
 						    (u64)sym.st_value, &phdr)) {
 				pr_debug4("%s: failed to find program header for "
 					   "symbol: %s st_value: %#" PRIx64 "\n",
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 0b893dcc8ea6..e297de14184c 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -132,6 +132,8 @@ struct addr_location {
 	s32	      socket;
 };
 
+void addr_location__put(struct addr_location *al);
+
 int dso__load(struct dso *dso, struct map *map);
 int dso__load_vmlinux(struct dso *dso, struct map *map,
 		      const char *vmlinux, bool vmlinux_allocated);
diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index cccd293b5312..3ab6a92b1a6d 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c
@@ -2157,6 +2157,7 @@ int perf_event__synthesize_attr(struct perf_tool *tool, struct perf_event_attr *
 	return err;
 }
 
+#ifdef HAVE_LIBTRACEEVENT
 int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, struct evlist *evlist,
 					perf_event__handler_t process)
 {
@@ -2203,6 +2204,7 @@ int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, struct e
 
 	return aligned_size;
 }
+#endif
 
 int perf_event__synthesize_build_id(struct perf_tool *tool, struct dso *pos, u16 misc,
 				    perf_event__handler_t process, struct machine *machine)
@@ -2218,8 +2220,9 @@ int perf_event__synthesize_build_id(struct perf_tool *tool, struct dso *pos, u16
 	len = pos->long_name_len + 1;
 	len = PERF_ALIGN(len, NAME_ALIGN);
 	memcpy(&ev.build_id.build_id, pos->bid.data, sizeof(pos->bid.data));
+	ev.build_id.size = pos->bid.size;
 	ev.build_id.header.type = PERF_RECORD_HEADER_BUILD_ID;
-	ev.build_id.header.misc = misc;
+	ev.build_id.header.misc = misc | PERF_RECORD_MISC_BUILD_ID_SIZE;
 	ev.build_id.pid = machine->pid;
 	ev.build_id.header.size = sizeof(ev.build_id) + len;
 	memcpy(&ev.build_id.filename, pos->long_name, pos->long_name_len);
@@ -2354,6 +2357,7 @@ int perf_event__synthesize_for_pipe(struct perf_tool *tool,
 	}
 	ret += err;
 
+#ifdef HAVE_LIBTRACEEVENT
 	if (have_tracepoints(&evlist->core.entries)) {
 		int fd = perf_data__fd(data);
 
@@ -2373,6 +2377,9 @@ int perf_event__synthesize_for_pipe(struct perf_tool *tool,
 		}
 		ret += err;
 	}
+#else
+	(void)data;
+#endif
 
 	return ret;
 }
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 241f300d7d6e..395c626699a9 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -158,4 +158,7 @@ static inline bool thread__is_filtered(struct thread *thread)
 
 void thread__free_stitch_list(struct thread *thread);
 
+void thread__resolve(struct thread *thread, struct addr_location *al,
+		     struct perf_sample *sample);
+
 #endif	/* __PERF_THREAD_H */
diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c
index c9bfe4696943..e848579e61a8 100644
--- a/tools/perf/util/thread_map.c
+++ b/tools/perf/util/thread_map.c
@@ -18,6 +18,7 @@
 #include "thread_map.h"
 #include "debug.h"
 #include "event.h"
+#include <internal/threadmap.h>
 
 /* Skip "." and ".." directories */
 static int filter(const struct dirent *dir)
diff --git a/tools/perf/util/thread_map.h b/tools/perf/util/thread_map.h
index 3bb860a32b8e..00ec05fc1656 100644
--- a/tools/perf/util/thread_map.h
+++ b/tools/perf/util/thread_map.h
@@ -4,8 +4,6 @@
 
 #include <sys/types.h>
 #include <stdio.h>
-#include <linux/refcount.h>
-#include <internal/threadmap.h>
 #include <perf/threadmap.h>
 
 struct perf_record_thread_map;
diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c
index 892c323b4ac9..c24b3a15e319 100644
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -26,6 +26,7 @@
 #include <api/fs/tracing_path.h>
 #include "evsel.h"
 #include "debug.h"
+#include "util.h"
 
 #define VERSION "0.6"
 #define MAX_EVENT_LENGTH 512
@@ -38,15 +39,6 @@ struct tracepoint_path {
 	struct tracepoint_path *next;
 };
 
-int bigendian(void)
-{
-	unsigned char str[] = { 0x1, 0x2, 0x3, 0x4, 0x0, 0x0, 0x0, 0x0};
-	unsigned int *ptr;
-
-	ptr = (unsigned int *)(void *)str;
-	return *ptr == 0x01020304;
-}
-
 /* unfortunately, you can not stat debugfs or proc files for size */
 static int record_file(const char *file, ssize_t hdr_sz)
 {
@@ -79,7 +71,7 @@ static int record_file(const char *file, ssize_t hdr_sz)
 
 	/* ugh, handle big-endian hdr_size == 4 */
 	sizep = (char*)&size;
-	if (bigendian())
+	if (host_is_bigendian())
 		sizep += sizeof(u64) - hdr_sz;
 
 	if (hdr_sz && pwrite(output_fd, sizep, hdr_sz, hdr_pos) < 0) {
@@ -564,7 +556,7 @@ static int tracing_data_header(void)
 		return -1;
 
 	/* save endian */
-	if (bigendian())
+	if (host_is_bigendian())
 		buf[0] = 1;
 	else
 		buf[0] = 0;
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index c9c83a40647c..2d3c2576bab7 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -11,6 +11,8 @@
 #include "trace-event.h"
 
 #include <linux/ctype.h>
+#include <linux/kernel.h>
+#include <traceevent/event-parse.h>
 
 static int get_common_field(struct scripting_context *context,
 			    int *offset, int *size, const char *type)
diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c
index 8a01af783310..1162c49b8082 100644
--- a/tools/perf/util/trace-event-read.c
+++ b/tools/perf/util/trace-event-read.c
@@ -11,12 +11,14 @@
 #include <sys/stat.h>
 #include <sys/wait.h>
 #include <sys/mman.h>
+#include <traceevent/event-parse.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <errno.h>
 
 #include "trace-event.h"
 #include "debug.h"
+#include "util.h"
 
 static int input_fd;
 
@@ -414,7 +416,7 @@ ssize_t trace_report(int fd, struct trace_event *tevent, bool __repipe)
 		return -1;
 	}
 	file_bigendian = buf[0];
-	host_bigendian = bigendian();
+	host_bigendian = host_is_bigendian() ? 1 : 0;
 
 	if (trace_event__init(tevent)) {
 		pr_debug("trace_event__init failed");
diff --git a/tools/perf/util/trace-event-scripting.c b/tools/perf/util/trace-event-scripting.c
index 7172ca05265f..56175c53f9af 100644
--- a/tools/perf/util/trace-event-scripting.c
+++ b/tools/perf/util/trace-event-scripting.c
@@ -9,12 +9,13 @@
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
+#include <traceevent/event-parse.h>
 
 #include "debug.h"
 #include "trace-event.h"
-#include "event.h"
 #include "evsel.h"
 #include <linux/zalloc.h>
+#include "util/sample.h"
 
 struct scripting_context *scripting_context;
 
diff --git a/tools/perf/util/trace-event.c b/tools/perf/util/trace-event.c
index b3ee651e3d91..8ad75b31e09b 100644
--- a/tools/perf/util/trace-event.c
+++ b/tools/perf/util/trace-event.c
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0
-
 #include <stdio.h>
 #include <unistd.h>
 #include <stdlib.h>
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index 640981105788..add6c5d9531c 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -2,9 +2,11 @@
 #ifndef _PERF_UTIL_TRACE_EVENT_H
 #define _PERF_UTIL_TRACE_EVENT_H
 
-#include <traceevent/event-parse.h>
-#include "parse-events.h"
+#include <stdbool.h>
+#include <stdio.h>
+#include <linux/types.h>
 
+struct evlist;
 struct machine;
 struct perf_sample;
 union perf_event;
@@ -18,6 +20,11 @@ struct trace_event {
 	struct tep_plugin_list	*plugin_list;
 };
 
+typedef char *(tep_func_resolver_t)(void *priv,
+				    unsigned long long *addrp, char **modp);
+
+bool have_tracepoints(struct list_head *evlist);
+
 int trace_event__init(struct trace_event *t);
 void trace_event__cleanup(struct trace_event *t);
 int trace_event__register_resolver(struct machine *machine,
@@ -27,8 +34,6 @@ trace_event__tp_format(const char *sys, const char *name);
 
 struct tep_event *trace_event__tp_format_id(int id);
 
-int bigendian(void);
-
 void event_format__fprintf(struct tep_event *event,
 			   int cpu, void *data, int size, FILE *fp);
 
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index c1f2d423a9ec..1d3b300af5a1 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef GIT_COMPAT_UTIL_H
-#define GIT_COMPAT_UTIL_H
+#ifndef __PERF_UTIL_H
+#define __PERF_UTIL_H
 
 #define _BSD_SOURCE 1
 /* glibc 2.20 deprecates _BSD_SOURCE in favour of _DEFAULT_SOURCE */
@@ -94,4 +94,23 @@ int do_realloc_array_as_needed(void **arr, size_t *arr_sz, size_t x,
 		0;						\
 	})
 
-#endif /* GIT_COMPAT_UTIL_H */
+static inline bool host_is_bigendian(void)
+{
+#ifdef __BYTE_ORDER__
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	return false;
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	return true;
+#else
+#error "Unrecognized __BYTE_ORDER__"
+#endif
+#else /* !__BYTE_ORDER__ */
+	unsigned char str[] = { 0x1, 0x2, 0x3, 0x4, 0x0, 0x0, 0x0, 0x0};
+	unsigned int *ptr;
+
+	ptr = (unsigned int *)(void *)str;
+	return *ptr == 0x01020304;
+#endif
+}
+
+#endif /* __PERF_UTIL_H */